001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: /*
019: * HDFObjectFactory.java
020: *
021: * Created on February 24, 2002, 2:17 PM
022: */
023:
024: package org.apache.poi.hdf.model;
025:
026: //import java.io;
027:
028: import java.util.ArrayList;
029: import java.io.InputStream;
030: import java.io.FileInputStream;
031: import java.io.IOException;
032: import java.util.List;
033: import java.util.TreeSet;
034:
035: import org.apache.poi.hdf.model.hdftypes.*;
036: import org.apache.poi.hdf.event.HDFLowLevelParsingListener;
037: import org.apache.poi.hdf.model.util.BTreeSet;
038: import org.apache.poi.hdf.model.util.ParsingState;
039:
040: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
041: import org.apache.poi.poifs.filesystem.POIFSDocument;
042: import org.apache.poi.poifs.filesystem.DocumentEntry;
043: import org.apache.poi.util.LittleEndian;
044:
045: /**
046: * The Object Factory takes in a stream and creates the low level objects
047: * that represent the data.
048: * @author andy
049: */
050: public class HDFObjectFactory {
051:
052: /** OLE stuff*/
053: private POIFSFileSystem _filesystem;
054: /** The FIB*/
055: private FileInformationBlock _fib;
056:
057: /** Used to set up the object model*/
058: private HDFLowLevelParsingListener _listener;
059: /** parsing state for characters */
060: private ParsingState _charParsingState;
061: /** parsing state for paragraphs */
062: private ParsingState _parParsingState;
063:
064: /** main document stream buffer*/
065: byte[] _mainDocument;
066: /** table stream buffer*/
067: byte[] _tableBuffer;
068:
069: public static void main(String args[]) {
070: try {
071: HDFObjectFactory f = new HDFObjectFactory(
072: new FileInputStream("c:\\test.doc"));
073: int k = 0;
074: } catch (Throwable t) {
075: t.printStackTrace();
076: }
077: }
078:
079: /** Creates a new instance of HDFObjectFactory
080: *
081: * @param istream The InputStream that is the Word document
082: *
083: */
084: protected HDFObjectFactory(InputStream istream,
085: HDFLowLevelParsingListener l) throws IOException {
086: if (l == null) {
087: _listener = new HDFObjectModel();
088: } else {
089: _listener = l;
090: }
091:
092: //do Ole stuff
093: _filesystem = new POIFSFileSystem(istream);
094:
095: DocumentEntry headerProps = (DocumentEntry) _filesystem
096: .getRoot().getEntry("WordDocument");
097:
098: _mainDocument = new byte[headerProps.getSize()];
099: _filesystem.createDocumentInputStream("WordDocument").read(
100: _mainDocument);
101:
102: _fib = new FileInformationBlock(_mainDocument);
103:
104: initTableStream();
105: initTextPieces();
106: initFormattingProperties();
107:
108: }
109:
/**
 * Creates a new instance of HDFObjectFactory without an explicit listener.
 * Delegates to the two argument constructor, passing <code>null</code> as
 * the listener.
 *
 * @param istream The InputStream that is the Word document
 * @throws IOException propagated from the two argument constructor
 */
public HDFObjectFactory(InputStream istream) throws IOException {
    this(istream, null);
}
118:
119: public static List getTypes(InputStream istream) throws IOException {
120: List results = new ArrayList(1);
121:
122: //do Ole stuff
123: POIFSFileSystem filesystem = new POIFSFileSystem(istream);
124:
125: DocumentEntry headerProps = (DocumentEntry) filesystem
126: .getRoot().getEntry("WordDocument");
127:
128: byte[] mainDocument = new byte[headerProps.getSize()];
129: filesystem.createDocumentInputStream("WordDocument").read(
130: mainDocument);
131:
132: FileInformationBlock fib = new FileInformationBlock(
133: mainDocument);
134:
135: results.add(fib);
136: return results;
137: }
138:
139: /**
140: * Initializes the table stream
141: *
142: * @throws IOException
143: */
144: private void initTableStream() throws IOException {
145: String tablename = null;
146: if (_fib.isFWhichTblStm()) {
147: tablename = "1Table";
148: } else {
149: tablename = "0Table";
150: }
151:
152: DocumentEntry tableEntry = (DocumentEntry) _filesystem
153: .getRoot().getEntry(tablename);
154:
155: //load the table stream into a buffer
156: int size = tableEntry.getSize();
157: _tableBuffer = new byte[size];
158: _filesystem.createDocumentInputStream(tablename).read(
159: _tableBuffer);
160: }
161:
162: /**
163: * Initializes the text pieces. Text is divided into pieces because some
164: * "pieces" may only contain unicode characters.
165: *
166: * @throws IOException
167: */
168: private void initTextPieces() throws IOException {
169: int pos = _fib.getFcClx();
170:
171: //skips through the prms before we reach the piece table. These contain data
172: //for actual fast saved files
173: while (_tableBuffer[pos] == 1) {
174: pos++;
175: int skip = LittleEndian.getShort(_tableBuffer, pos);
176: pos += 2 + skip;
177: }
178: if (_tableBuffer[pos] != 2) {
179: throw new IOException("The text piece table is corrupted");
180: } else {
181: //parse out the text pieces
182: int pieceTableSize = LittleEndian.getInt(_tableBuffer,
183: ++pos);
184: pos += 4;
185: int pieces = (pieceTableSize - 4) / 12;
186: for (int x = 0; x < pieces; x++) {
187: int filePos = LittleEndian.getInt(_tableBuffer, pos
188: + ((pieces + 1) * 4) + (x * 8) + 2);
189: boolean unicode = false;
190: if ((filePos & 0x40000000) == 0) {
191: unicode = true;
192: } else {
193: unicode = false;
194: filePos &= ~(0x40000000);//gives me FC in doc stream
195: filePos /= 2;
196: }
197: int totLength = LittleEndian.getInt(_tableBuffer, pos
198: + (x + 1) * 4)
199: - LittleEndian.getInt(_tableBuffer, pos
200: + (x * 4));
201:
202: TextPiece piece = new TextPiece(filePos, totLength,
203: unicode);
204: _listener.text(piece);
205:
206: }
207:
208: }
209:
210: }
211:
212: /**
213: * initializes all of the formatting properties for a Word Document
214: */
215: private void initFormattingProperties() {
216: createStyleSheet();
217: createListTables();
218: createFontTable();
219:
220: initDocumentProperties();
221: initSectionProperties();
222: //initCharacterProperties();
223: //initParagraphProperties();
224: }
225:
226: private void initCharacterProperties(int charOffset,
227: PlexOfCps charPlcf, int start, int end) {
228: //Initialize paragraph property stuff
229: //int currentCharPage = _charParsingState.getCurrentPage();
230: int charPlcfLen = charPlcf.length();
231: int currentPageIndex = _charParsingState.getCurrentPageIndex();
232: FormattedDiskPage fkp = _charParsingState.getFkp();
233: int currentChpxIndex = _charParsingState.getCurrentPropIndex();
234: int currentArraySize = fkp.size();
235:
236: //get the character runs for this paragraph
237: int charStart = 0;
238: int charEnd = 0;
239: //add the character runs
240: do {
241: if (currentChpxIndex < currentArraySize) {
242: charStart = fkp.getStart(currentChpxIndex);
243: charEnd = fkp.getEnd(currentChpxIndex);
244: byte[] chpx = fkp.getGrpprl(currentChpxIndex);
245: _listener.characterRun(new ChpxNode(Math.max(charStart,
246: start), Math.min(charEnd, end), chpx));
247:
248: if (charEnd < end) {
249: currentChpxIndex++;
250: } else {
251: _charParsingState.setState(currentPageIndex, fkp,
252: currentChpxIndex);
253: break;
254: }
255: } else {
256: int currentCharPage = LittleEndian
257: .getInt(
258: _tableBuffer,
259: charOffset
260: + charPlcf
261: .getStructOffset(++currentPageIndex));
262: byte[] byteFkp = new byte[512];
263: System.arraycopy(_mainDocument,
264: (currentCharPage * 512), byteFkp, 0, 512);
265: fkp = new CHPFormattedDiskPage(byteFkp);
266: currentChpxIndex = 0;
267: currentArraySize = fkp.size();
268: }
269: } while (currentPageIndex < charPlcfLen);
270: }
271:
272: private void initParagraphProperties(int parOffset,
273: PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf,
274: int start, int end) {
275: //Initialize paragraph property stuff
276: //int currentParPage = _parParsingState.getCurrentPage();
277: int parPlcfLen = parPlcf.length();
278: int currentPageIndex = _parParsingState.getCurrentPageIndex();
279: FormattedDiskPage fkp = _parParsingState.getFkp();
280: int currentPapxIndex = _parParsingState.getCurrentPropIndex();
281: int currentArraySize = fkp.size();
282:
283: do {
284: if (currentPapxIndex < currentArraySize) {
285: int parStart = fkp.getStart(currentPapxIndex);
286: int parEnd = fkp.getEnd(currentPapxIndex);
287: byte[] papx = fkp.getGrpprl(currentPapxIndex);
288: _listener.paragraph(new PapxNode(Math.max(parStart,
289: start), Math.min(parEnd, end), papx));
290: initCharacterProperties(charOffset, charPlcf, Math.max(
291: start, parStart), Math.min(parEnd, end));
292: if (parEnd < end) {
293: currentPapxIndex++;
294: } else {
295: //save the state
296: _parParsingState.setState(currentPageIndex, fkp,
297: currentPapxIndex);
298: break;
299: }
300: } else {
301: int currentParPage = LittleEndian
302: .getInt(
303: _tableBuffer,
304: parOffset
305: + parPlcf
306: .getStructOffset(++currentPageIndex));
307: byte byteFkp[] = new byte[512];
308: System.arraycopy(_mainDocument, (currentParPage * 512),
309: byteFkp, 0, 512);
310: fkp = new PAPFormattedDiskPage(byteFkp);
311: currentPapxIndex = 0;
312: currentArraySize = fkp.size();
313: }
314: } while (currentPageIndex < parPlcfLen);
315: }
316:
317: /**
318: * initializes the CharacterProperties BTree
319: */
320: /*private void initCharacterProperties()
321: {
322: int charOffset = _fib.getFcPlcfbteChpx();
323: int charPlcSize = _fib.getLcbPlcfbteChpx();
324:
325: //int arraySize = (charPlcSize - 4)/8;
326:
327: //first we must go through the bin table and find the fkps
328: for(int x = 0; x < arraySize; x++)
329: {
330:
331: //get page number(has nothing to do with document page)
332: //containing the chpx for the paragraph
333: int PN = LittleEndian.getInt(_tableBuffer, charOffset + (4 * (arraySize + 1) + (4 * x)));
334:
335: byte[] fkp = new byte[512];
336: System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
337: //take each fkp and get the chpxs
338: int crun = LittleEndian.getUnsignedByte(fkp, 511);
339: for(int y = 0; y < crun; y++)
340: {
341: //get the beginning fc of each paragraph text run
342: int fcStart = LittleEndian.getInt(fkp, y * 4);
343: int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
344: //get the offset in fkp of the papx for this paragraph
345: int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);
346:
347: //optimization if offset == 0 use "Normal" style
348: if(chpxOffset == 0)
349:
350: {
351: _characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
352: continue;
353: }
354:
355: int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
356:
357: byte[] chpx = new byte[size];
358: System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
359: //_papTable.put(new Integer(fcStart), papx);
360: _characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
361: }
362:
363: }
364: }*/
365: /**
366: * intializes the Paragraph Properties BTree
367: */
368: private void initParagraphProperties() {
369: //paragraphs
370: int parOffset = _fib.getFcPlcfbtePapx();
371: int parPlcSize = _fib.getLcbPlcfbtePapx();
372:
373: //characters
374: int charOffset = _fib.getFcPlcfbteChpx();
375: int charPlcSize = _fib.getLcbPlcfbteChpx();
376:
377: PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
378: PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
379:
380: //Initialize character property stuff
381: int currentCharPage = LittleEndian.getInt(_tableBuffer,
382: charOffset + charPlcf.getStructOffset(0));
383: int charPlcfLen = charPlcf.length();
384: int currentPageIndex = 0;
385: byte[] fkp = new byte[512];
386: System.arraycopy(_mainDocument, (currentCharPage * 512), fkp,
387: 0, 512);
388: CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
389: int currentChpxIndex = 0;
390: int currentArraySize = cfkp.size();
391:
392: int arraySize = parPlcf.length();
393:
394: //first we must go through the bin table and find the fkps
395: for (int x = 0; x < arraySize; x++) {
396: int PN = LittleEndian.getInt(_tableBuffer, parOffset
397: + parPlcf.getStructOffset(x));
398:
399: fkp = new byte[512];
400: System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
401:
402: PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
403: //take each fkp and get the paps
404: int crun = pfkp.size();
405: for (int y = 0; y < crun; y++) {
406: //get the beginning fc of each paragraph text run
407: int fcStart = pfkp.getStart(y);
408: int fcEnd = pfkp.getEnd(y);
409:
410: //get the papx for this paragraph
411: byte[] papx = pfkp.getGrpprl(y);
412:
413: _listener.paragraph(new PapxNode(fcStart, fcEnd, papx));
414:
415: //get the character runs for this paragraph
416: int charStart = 0;
417: int charEnd = 0;
418: //add the character runs
419: do {
420: if (currentChpxIndex < currentArraySize) {
421: charStart = cfkp.getStart(currentChpxIndex);
422: charEnd = cfkp.getEnd(currentChpxIndex);
423: byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
424: _listener.characterRun(new ChpxNode(charStart,
425: charEnd, chpx));
426: if (charEnd < fcEnd) {
427: currentChpxIndex++;
428: } else {
429: break;
430: }
431: } else {
432: currentCharPage = LittleEndian
433: .getInt(
434: _tableBuffer,
435: charOffset
436: + charPlcf
437: .getStructOffset(++currentPageIndex));
438: fkp = new byte[512];
439: System.arraycopy(_mainDocument,
440: (currentCharPage * 512), fkp, 0, 512);
441: cfkp = new CHPFormattedDiskPage(fkp);
442: currentChpxIndex = 0;
443: currentArraySize = cfkp.size();
444: }
445: } while (currentCharPage <= charPlcfLen + 1);
446:
447: }
448:
449: }
450:
451: }
452:
453: private void initParsingStates(int parOffset, PlexOfCps parPlcf,
454: int charOffset, PlexOfCps charPlcf) {
455: int currentCharPage = LittleEndian.getInt(_tableBuffer,
456: charOffset + charPlcf.getStructOffset(0));
457: byte[] fkp = new byte[512];
458: System.arraycopy(_mainDocument, (currentCharPage * 512), fkp,
459: 0, 512);
460: CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
461: _charParsingState = new ParsingState(currentCharPage, cfkp);
462:
463: int currentParPage = LittleEndian.getInt(_tableBuffer,
464: parOffset + parPlcf.getStructOffset(0));
465: fkp = new byte[512];
466: System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0,
467: 512);
468: PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
469: _parParsingState = new ParsingState(currentParPage, pfkp);
470: }
471:
472: /**
473: * initializes the SectionProperties BTree
474: */
475: private void initSectionProperties() {
476:
477: int ccpText = _fib.getCcpText();
478: int ccpFtn = _fib.getCcpFtn();
479:
480: //sections
481: int fcMin = _fib.getFcMin();
482: int plcfsedFC = _fib.getFcPlcfsed();
483: int plcfsedSize = _fib.getLcbPlcfsed();
484:
485: //paragraphs
486: int parOffset = _fib.getFcPlcfbtePapx();
487: int parPlcSize = _fib.getLcbPlcfbtePapx();
488:
489: //characters
490: int charOffset = _fib.getFcPlcfbteChpx();
491: int charPlcSize = _fib.getLcbPlcfbteChpx();
492:
493: PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
494: PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
495:
496: initParsingStates(parOffset, parPlcf, charOffset, charPlcf);
497:
498: //byte[] plcfsed = new byte[plcfsedSize];
499: //System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
500:
501: PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
502: int arraySize = plcfsed.length();
503:
504: int start = fcMin;
505: int end = fcMin + ccpText;
506: int x = 0;
507: int sectionEnd = 0;
508:
509: //do the main body sections
510: while (x < arraySize) {
511: int sectionStart = LittleEndian.getInt(_tableBuffer,
512: plcfsedFC + plcfsed.getIntOffset(x))
513: + fcMin;
514: sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC
515: + plcfsed.getIntOffset(x + 1))
516: + fcMin;
517: int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC
518: + plcfsed.getStructOffset(x) + 2);
519: int sepxSize = LittleEndian.getShort(_mainDocument,
520: sepxStart);
521:
522: byte[] sepx = new byte[sepxSize];
523: System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0,
524: sepxSize);
525: SepxNode node = new SepxNode(x + 1, sectionStart,
526: sectionEnd, sepx);
527: _listener.bodySection(node);
528: initParagraphProperties(parOffset, parPlcf, charOffset,
529: charPlcf, sectionStart, Math.min(end, sectionEnd));
530:
531: if (sectionEnd > end) {
532: break;
533: } else {
534: x++;
535: }
536: }
537: //do the header sections
538: for (; x < arraySize; x++)// && sectionEnd <= end; x++)
539: {
540: int sectionStart = LittleEndian.getInt(_tableBuffer,
541: plcfsedFC + plcfsed.getIntOffset(x))
542: + fcMin;
543: sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC
544: + plcfsed.getIntOffset(x + 1))
545: + fcMin;
546: int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC
547: + plcfsed.getStructOffset(x) + 2);
548: int sepxSize = LittleEndian.getShort(_mainDocument,
549: sepxStart);
550:
551: byte[] sepx = new byte[sepxSize];
552: System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0,
553: sepxSize);
554: SepxNode node = new SepxNode(x + 1, sectionStart,
555: sectionEnd, sepx);
556: _listener.hdrSection(node);
557: initParagraphProperties(parOffset, parPlcf, charOffset,
558: charPlcf, Math.max(sectionStart, end), sectionEnd);
559:
560: }
561: _listener.endSections();
562: }
563:
564: /**
565: * Initializes the DocumentProperties object unique to this document.
566: */
567: private void initDocumentProperties() {
568: int pos = _fib.getFcDop();
569: int size = _fib.getLcbDop();
570: byte[] dopArray = new byte[size];
571:
572: System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
573: _listener.document(new DocumentProperties(dopArray));
574: }
575:
576: /**
577: * Uncompresses the StyleSheet from file into memory.
578: */
579: private void createStyleSheet() {
580: int stshIndex = _fib.getFcStshf();
581: int stshSize = _fib.getLcbStshf();
582: byte[] stsh = new byte[stshSize];
583: System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);
584:
585: _listener.styleSheet(new StyleSheet(stsh));
586: }
587:
588: /**
589: * Initializes the list tables for this document
590: */
591: private void createListTables() {
592: int lfoOffset = _fib.getFcPlfLfo();
593: int lfoSize = _fib.getLcbPlfLfo();
594: byte[] plflfo = new byte[lfoSize];
595:
596: System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);
597:
598: int lstOffset = _fib.getFcPlcfLst();
599: int lstSize = _fib.getLcbPlcfLst();
600: if (lstOffset > 0 && lstSize > 0) {
601: // The lstSize returned by _fib.getLcbPlcfLst() doesn't appear
602: // to take into account any LVLs. Therefore, we recalculate
603: // lstSize based on where the LFO section begins (because the
604: // LFO section immediately follows the LST section).
605: lstSize = lfoOffset - lstOffset;
606: byte[] plcflst = new byte[lstSize];
607: System.arraycopy(_tableBuffer, lstOffset, plcflst, 0,
608: lstSize);
609: _listener.lists(new ListTables(plcflst, plflfo));
610: }
611: }
612:
613: /**
614: * Initializes this document's FontTable;
615: */
616: private void createFontTable() {
617: int fontTableIndex = _fib.getFcSttbfffn();
618: int fontTableSize = _fib.getLcbSttbfffn();
619: byte[] fontTable = new byte[fontTableSize];
620: System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0,
621: fontTableSize);
622: _listener.fonts(new FontTable(fontTable));
623: }
624:
625: }
|