001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: package org.apache.poi.hwpf;
019:
020: import java.io.InputStream;
021: import java.io.FileInputStream;
022: import java.io.FileNotFoundException;
023: import java.io.PushbackInputStream;
024: import java.io.IOException;
025: import java.io.OutputStream;
026: import java.io.ByteArrayInputStream;
027:
028: import java.util.Iterator;
029:
030: import org.apache.poi.POIDocument;
031: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
032: import org.apache.poi.poifs.filesystem.DocumentEntry;
033: import org.apache.poi.poifs.common.POIFSConstants;
034:
035: import org.apache.poi.hwpf.model.*;
036: import org.apache.poi.hwpf.model.io.*;
037: import org.apache.poi.hwpf.usermodel.*;
038:
039: /**
040: *
041: * This class acts as the bucket that we throw all of the Word data structures
042: * into.
043: *
044: * @author Ryan Ackley
045: */
046: public class HWPFDocument extends POIDocument
047: // implements Cloneable
048: {
049: /** The FIB*/
050: protected FileInformationBlock _fib;
051:
052: /** main document stream buffer*/
053: private byte[] _mainStream;
054:
055: /** table stream buffer*/
056: private byte[] _tableStream;
057:
058: /** data stream buffer*/
059: protected byte[] _dataStream;
060:
061: /** Document wide Properties*/
062: protected DocumentProperties _dop;
063:
064: /** Contains text of the document wrapped in a obfuscated Word data
065: * structure*/
066: protected ComplexFileTable _cft;
067:
068: protected TextPieceTable _tpt;
069:
070: /** Contains formatting properties for text*/
071: protected CHPBinTable _cbt;
072:
073: /** Contains formatting properties for paragraphs*/
074: protected PAPBinTable _pbt;
075:
076: /** Contains formatting properties for sections.*/
077: protected SectionTable _st;
078:
079: /** Holds styles for this document.*/
080: protected StyleSheet _ss;
081:
082: /** Holds fonts for this document.*/
083: protected FontTable _ft;
084:
085: /** Hold list tables */
086: protected ListTables _lt;
087:
088: /** Holds the save history for this document. */
089: protected SavedByTable _sbt;
090:
091: /** Holds pictures table */
092: protected PicturesTable _pictures;
093:
094: protected HWPFDocument() {
095:
096: }
097:
098: /**
099: * Takens an InputStream, verifies that it's not RTF, builds a
100: * POIFSFileSystem from it, and returns that.
101: */
102: public static POIFSFileSystem verifyAndBuildPOIFS(
103: InputStream istream) throws IOException {
104: // Open a PushbackInputStream, so we can peek at the first few bytes
105: PushbackInputStream pis = new PushbackInputStream(istream, 6);
106: byte[] first6 = new byte[6];
107: pis.read(first6);
108:
109: // Does it start with {\rtf ? If so, it's really RTF
110: if (first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
111: && first6[3] == 't' && first6[4] == 'f') {
112: throw new IllegalArgumentException(
113: "The document is really a RTF file");
114: }
115:
116: // OK, so it's not RTF
117: // Open a POIFSFileSystem on the (pushed back) stream
118: pis.unread(first6);
119: return new POIFSFileSystem(pis);
120: }
121:
122: /**
123: * This constructor loads a Word document from an InputStream.
124: *
125: * @param istream The InputStream that contains the Word document.
126: * @throws IOException If there is an unexpected IOException from the passed
127: * in InputStream.
128: */
129: public HWPFDocument(InputStream istream) throws IOException {
130: //do Ole stuff
131: this (verifyAndBuildPOIFS(istream));
132: }
133:
134: /**
135: * This constructor loads a Word document from a POIFSFileSystem
136: *
137: * @param pfilesystem The POIFSFileSystem that contains the Word document.
138: * @throws IOException If there is an unexpected IOException from the passed
139: * in POIFSFileSystem.
140: */
141: public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException {
142: // Sort out the hpsf properties
143: filesystem = pfilesystem;
144: readProperties();
145:
146: // read in the main stream.
147: DocumentEntry documentProps = (DocumentEntry) filesystem
148: .getRoot().getEntry("WordDocument");
149: _mainStream = new byte[documentProps.getSize()];
150: filesystem.createDocumentInputStream("WordDocument").read(
151: _mainStream);
152:
153: // use the fib to determine the name of the table stream.
154: _fib = new FileInformationBlock(_mainStream);
155:
156: String name = "0Table";
157: if (_fib.isFWhichTblStm()) {
158: name = "1Table";
159: }
160:
161: // Grab the table stream.
162: DocumentEntry tableProps;
163: try {
164: tableProps = (DocumentEntry) filesystem.getRoot().getEntry(
165: name);
166: } catch (FileNotFoundException fnfe) {
167: throw new IllegalStateException(
168: "Table Stream '"
169: + name
170: + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
171: }
172:
173: // read in the table stream.
174: _tableStream = new byte[tableProps.getSize()];
175: filesystem.createDocumentInputStream(name).read(_tableStream);
176:
177: _fib.fillVariableFields(_mainStream, _tableStream);
178:
179: // read in the data stream.
180: try {
181: DocumentEntry dataProps = (DocumentEntry) filesystem
182: .getRoot().getEntry("Data");
183: _dataStream = new byte[dataProps.getSize()];
184: filesystem.createDocumentInputStream("Data").read(
185: _dataStream);
186: } catch (java.io.FileNotFoundException e) {
187: _dataStream = new byte[0];
188: }
189:
190: // read in the pictures stream
191: _pictures = new PicturesTable(_dataStream);
192:
193: // get the start of text in the main stream
194: int fcMin = _fib.getFcMin();
195:
196: // load up our standard structures.
197: _dop = new DocumentProperties(_tableStream, _fib.getFcDop());
198: _cft = new ComplexFileTable(_mainStream, _tableStream, _fib
199: .getFcClx(), fcMin);
200: _tpt = _cft.getTextPieceTable();
201: _cbt = new CHPBinTable(_mainStream, _tableStream, _fib
202: .getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), fcMin);
203: _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream,
204: _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(),
205: fcMin);
206:
207: // Word XP puts in a zero filled buffer in front of the text and it screws
208: // up my system for offsets. This is an adjustment.
209: int cpMin = _tpt.getCpMin();
210: if (cpMin > 0) {
211: _cbt.adjustForDelete(0, 0, cpMin);
212: _pbt.adjustForDelete(0, 0, cpMin);
213: }
214:
215: _st = new SectionTable(_mainStream, _tableStream, _fib
216: .getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin,
217: getTextTable().getTextPieces());
218: _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
219: _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib
220: .getLcbSttbfffn());
221:
222: int listOffset = _fib.getFcPlcfLst();
223: int lfoOffset = _fib.getFcPlfLfo();
224: if (listOffset != 0 && _fib.getLcbPlcfLst() != 0) {
225: _lt = new ListTables(_tableStream, _fib.getFcPlcfLst(),
226: _fib.getFcPlfLfo());
227: }
228:
229: int sbtOffset = _fib.getFcSttbSavedBy();
230: int sbtLength = _fib.getLcbSttbSavedBy();
231: if (sbtOffset != 0 && sbtLength != 0) {
232: _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
233: }
234:
235: PlexOfCps plc = new PlexOfCps(_tableStream, _fib
236: .getFcPlcffldMom(), _fib.getLcbPlcffldMom(), 2);
237: for (int x = 0; x < plc.length(); x++) {
238: GenericPropertyNode node = plc.getProperty(x);
239: byte[] fld = node.getBytes();
240: int breakpoint = 0;
241: }
242: }
243:
244: public StyleSheet getStyleSheet() {
245: return _ss;
246: }
247:
248: public FileInformationBlock getFileInformationBlock() {
249: return _fib;
250: }
251:
252: public DocumentProperties getDocProperties() {
253: return _dop;
254: }
255:
256: public Range getRange() {
257: // hack to get the ending cp of the document, Have to revisit this.
258: java.util.List text = _tpt.getTextPieces();
259: PropertyNode p = (PropertyNode) text.get(text.size() - 1);
260:
261: return new Range(0, p.getEnd(), this );
262: }
263:
264: /**
265: * Returns the character length of a document.
266: * @return the character length of a document
267: */
268: public int characterLength() {
269: java.util.List textPieces = _tpt.getTextPieces();
270: Iterator textIt = textPieces.iterator();
271:
272: int length = 0;
273: while (textIt.hasNext()) {
274: TextPiece tp = (TextPiece) textIt.next();
275: length += tp.characterLength();
276: }
277: return length;
278: }
279:
280: public ListTables getListTables() {
281: return _lt;
282: }
283:
284: /**
285: * Gets a reference to the saved -by table, which holds the save history for the document.
286: *
287: * @return the saved-by table.
288: */
289: public SavedByTable getSavedByTable() {
290: return _sbt;
291: }
292:
293: /**
294: * @return PicturesTable object, that is able to extract images from this document
295: */
296: public PicturesTable getPicturesTable() {
297: return _pictures;
298: }
299:
300: /**
301: * Writes out the word file that is represented by an instance of this class.
302: *
303: * @param out The OutputStream to write to.
304: * @throws IOException If there is an unexpected IOException from the passed
305: * in OutputStream.
306: */
307: public void write(OutputStream out) throws IOException {
308: // initialize our streams for writing.
309: HWPFFileSystem docSys = new HWPFFileSystem();
310: HWPFOutputStream mainStream = docSys.getStream("WordDocument");
311: HWPFOutputStream tableStream = docSys.getStream("1Table");
312: //HWPFOutputStream dataStream = docSys.getStream("Data");
313: int tableOffset = 0;
314:
315: // FileInformationBlock fib = (FileInformationBlock)_fib.clone();
316: // clear the offsets and sizes in our FileInformationBlock.
317: _fib.clearOffsetsSizes();
318:
319: // determine the FileInformationBLock size
320: int fibSize = _fib.getSize();
321: fibSize += POIFSConstants.BIG_BLOCK_SIZE
322: - (fibSize % POIFSConstants.BIG_BLOCK_SIZE);
323:
324: // preserve space for the FileInformationBlock because we will be writing
325: // it after we write everything else.
326: byte[] placeHolder = new byte[fibSize];
327: mainStream.write(placeHolder);
328: int mainOffset = mainStream.getOffset();
329:
330: // write out the StyleSheet.
331: _fib.setFcStshf(tableOffset);
332: _ss.writeTo(tableStream);
333: _fib.setLcbStshf(tableStream.getOffset() - tableOffset);
334: tableOffset = tableStream.getOffset();
335:
336: // get fcMin and fcMac because we will be writing the actual text with the
337: // complex table.
338: int fcMin = mainOffset;
339:
340: // write out the Complex table, includes text.
341: _fib.setFcClx(tableOffset);
342: _cft.writeTo(docSys);
343: _fib.setLcbClx(tableStream.getOffset() - tableOffset);
344: tableOffset = tableStream.getOffset();
345: int fcMac = mainStream.getOffset();
346:
347: // write out the CHPBinTable.
348: _fib.setFcPlcfbteChpx(tableOffset);
349: _cbt.writeTo(docSys, fcMin);
350: _fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset);
351: tableOffset = tableStream.getOffset();
352:
353: // write out the PAPBinTable.
354: _fib.setFcPlcfbtePapx(tableOffset);
355: _pbt.writeTo(docSys, fcMin);
356: _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
357: tableOffset = tableStream.getOffset();
358:
359: // write out the SectionTable.
360: _fib.setFcPlcfsed(tableOffset);
361: _st.writeTo(docSys, fcMin);
362: _fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset);
363: tableOffset = tableStream.getOffset();
364:
365: // write out the list tables
366: if (_lt != null) {
367: _fib.setFcPlcfLst(tableOffset);
368: _lt.writeListDataTo(tableStream);
369: _fib.setLcbPlcfLst(tableStream.getOffset() - tableOffset);
370:
371: _fib.setFcPlfLfo(tableStream.getOffset());
372: _lt.writeListOverridesTo(tableStream);
373: _fib.setLcbPlfLfo(tableStream.getOffset() - tableOffset);
374: tableOffset = tableStream.getOffset();
375: }
376:
377: // write out the saved-by table.
378: if (_sbt != null) {
379: _fib.setFcSttbSavedBy(tableOffset);
380: _sbt.writeTo(tableStream);
381: _fib.setLcbSttbSavedBy(tableStream.getOffset()
382: - tableOffset);
383:
384: tableOffset = tableStream.getOffset();
385: }
386:
387: // write out the FontTable.
388: _fib.setFcSttbfffn(tableOffset);
389: _ft.writeTo(docSys);
390: _fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset);
391: tableOffset = tableStream.getOffset();
392:
393: // write out the DocumentProperties.
394: _fib.setFcDop(tableOffset);
395: byte[] buf = new byte[_dop.getSize()];
396: _fib.setLcbDop(_dop.getSize());
397: _dop.serialize(buf, 0);
398: tableStream.write(buf);
399:
400: // set some variables in the FileInformationBlock.
401: _fib.setFcMin(fcMin);
402: _fib.setFcMac(fcMac);
403: _fib.setCbMac(mainStream.getOffset());
404:
405: // make sure that the table, doc and data streams use big blocks.
406: byte[] mainBuf = mainStream.toByteArray();
407: if (mainBuf.length < 4096) {
408: byte[] tempBuf = new byte[4096];
409: System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length);
410: mainBuf = tempBuf;
411: }
412:
413: // write out the FileInformationBlock.
414: //_fib.serialize(mainBuf, 0);
415: _fib.writeTo(mainBuf, tableStream);
416:
417: byte[] tableBuf = tableStream.toByteArray();
418: if (tableBuf.length < 4096) {
419: byte[] tempBuf = new byte[4096];
420: System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length);
421: tableBuf = tempBuf;
422: }
423:
424: byte[] dataBuf = _dataStream;
425: if (dataBuf == null) {
426: dataBuf = new byte[4096];
427: }
428: if (dataBuf.length < 4096) {
429: byte[] tempBuf = new byte[4096];
430: System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length);
431: dataBuf = tempBuf;
432: }
433:
434: // spit out the Word document.
435: POIFSFileSystem pfs = new POIFSFileSystem();
436: pfs.createDocument(new ByteArrayInputStream(mainBuf),
437: "WordDocument");
438: pfs
439: .createDocument(new ByteArrayInputStream(tableBuf),
440: "1Table");
441: pfs.createDocument(new ByteArrayInputStream(dataBuf), "Data");
442:
443: pfs.writeFilesystem(out);
444: }
445:
446: public CHPBinTable getCharacterTable() {
447: return _cbt;
448: }
449:
450: public PAPBinTable getParagraphTable() {
451: return _pbt;
452: }
453:
454: public SectionTable getSectionTable() {
455: return _st;
456: }
457:
458: public TextPieceTable getTextTable() {
459: return _cft.getTextPieceTable();
460: }
461:
462: public byte[] getDataStream() {
463: return _dataStream;
464: }
465:
466: public int registerList(HWPFList list) {
467: if (_lt == null) {
468: _lt = new ListTables();
469: }
470: return _lt.addList(list.getListData(), list.getOverride());
471: }
472:
473: public FontTable getFontTable() {
474: return _ft;
475: }
476:
477: public void delete(int start, int length) {
478: Range r = new Range(start, start + length, this );
479: r.delete();
480: }
481:
482: /**
483: * Takes two arguments, 1) name of the Word file to read in 2) location to
484: * write it out at.
485: * @param args
486: */
487: public static void main(String[] args) {
488:
489: try {
490: HWPFDocument doc = new HWPFDocument(new FileInputStream(
491: args[0]));
492: Range r = doc.getRange();
493: String str = r.text();
494: int x = 0;
495: // CharacterRun run = new CharacterRun();
496: // run.setBold(true);
497: // run.setItalic(true);
498: // run.setCapitalized(true);
499: //
500: // Range range = doc.getRange();
501: // range.insertBefore("Hello World!!! HAHAHAHAHA I DID IT!!!", run);
502: //
503: // OutputStream out = new FileOutputStream(args[1]);
504: // doc.write(out);
505: //
506: // out.flush();
507: // out.close();
508:
509: } catch (Throwable t) {
510: t.printStackTrace();
511: }
512: }
513:
514: // public Object clone()
515: // throws CloneNotSupportedException
516: // {
517: // _tpt;
518: //
519: // _cbt;
520: //
521: // _pbt;
522: //
523: // _st;
524: //
525: // }
526: }
|