001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: package org.apache.poi.hslf.dev;
019:
020: import java.util.*;
021: import java.io.*;
022:
023: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
024: import org.apache.poi.poifs.filesystem.POIFSDocument;
025: import org.apache.poi.poifs.filesystem.DocumentEntry;
026:
027: import org.apache.poi.ddf.*;
028: import org.apache.poi.hslf.record.RecordTypes;
029:
030: import org.apache.poi.util.LittleEndian;
031:
032: /**
033: * This class provides a way to "peek" inside a powerpoint file. It
034: * will print out all the types it find, and for those it know aren't
035: * atoms, what they contain
036: *
037: * To figure out what things are, and if they are atoms or not, used the
038: * list in hslf.record.RecordTypes
039: *
040: * To peek inside PPDrawings, which hold Escher drawings, we use the
041: * DDF package from POI (but we can fake it by using the Escher listings
042: * from hslf.record.RecordTypes also)
043: *
044: * @author Nick Burch
045: */
046:
047: public class SlideShowDumper {
048: private InputStream istream;
049: private POIFSFileSystem filesystem;
050:
051: private byte[] _docstream;
052:
053: /** Do we try to use DDF to understand the escher objects? */
054: private boolean ddfEscher = false;
055: /** Do we use our own built-in basic escher groker to understand the escher objects? */
056: private boolean basicEscher = false;
057:
058: /**
059: * right now this function takes one parameter: a ppt file, and outputs
060: * a dump of what it contains
061: */
062: public static void main(String args[]) throws IOException {
063: if (args.length == 0) {
064: System.err
065: .println("Useage: SlideShowDumper [-escher|-basicescher] <filename>");
066: return;
067: }
068:
069: String filename = args[0];
070: if (args.length > 1) {
071: filename = args[1];
072: }
073:
074: SlideShowDumper foo = new SlideShowDumper(filename);
075:
076: if (args.length > 1) {
077: if (args[0].equalsIgnoreCase("-escher")) {
078: foo.setDDFEscher(true);
079: } else {
080: foo.setBasicEscher(true);
081: }
082: }
083:
084: foo.printDump();
085: foo.close();
086: }
087:
088: /**
089: * Constructs a Powerpoint dump from fileName. Parses the document
090: * and dumps out the contents
091: *
092: * @param fileName The name of the file to read.
093: * @throws IOException if there is a problem while parsing the document.
094: */
095: public SlideShowDumper(String fileName) throws IOException {
096: this (new FileInputStream(fileName));
097: }
098:
099: /**
100: * Constructs a Powerpoint dump from an input stream. Parses the
101: * document and dumps out the contents
102: *
103: * @param inputStream the source of the data
104: * @throws IOException if there is a problem while parsing the document.
105: */
106: public SlideShowDumper(InputStream inputStream) throws IOException {
107: //do Ole stuff
108: this (new POIFSFileSystem(inputStream));
109: istream = inputStream;
110: }
111:
112: /**
113: * Constructs a Powerpoint dump from a POIFS Filesystem. Parses the
114: * document and dumps out the contents
115: *
116: * @param filesystem the POIFS FileSystem to read from
117: * @throws IOException if there is a problem while parsing the document.
118: */
119: public SlideShowDumper(POIFSFileSystem filesystem)
120: throws IOException {
121: this .filesystem = filesystem;
122:
123: // Get the main document stream
124: DocumentEntry docProps = (DocumentEntry) filesystem.getRoot()
125: .getEntry("PowerPoint Document");
126:
127: // Grab the document stream
128: _docstream = new byte[docProps.getSize()];
129: filesystem.createDocumentInputStream("PowerPoint Document")
130: .read(_docstream);
131: }
132:
133: /**
134: * Control dumping of any Escher records found - should DDF be used?
135: */
136: public void setDDFEscher(boolean grok) {
137: ddfEscher = grok;
138: basicEscher = !(grok);
139: }
140:
141: /**
142: * Control dumping of any Escher records found - should our built in
143: * basic groker be used?
144: */
145: public void setBasicEscher(boolean grok) {
146: basicEscher = grok;
147: ddfEscher = !(grok);
148: }
149:
150: /**
151: * Shuts things down. Closes underlying streams etc
152: *
153: * @throws IOException
154: */
155: public void close() throws IOException {
156: if (istream != null) {
157: istream.close();
158: }
159: filesystem = null;
160: }
161:
162: public void printDump() {
163: // The format of records in a powerpoint file are:
164: // <little endian 2 byte "info">
165: // <little endian 2 byte "type">
166: // <little endian 4 byte "length">
167: // If it has a zero length, following it will be another record
168: // <xx xx yy yy 00 00 00 00> <xx xx yy yy zz zz zz zz>
169: // If it has a length, depending on its type it may have children or data
170: // If it has children, these will follow straight away
171: // <xx xx yy yy zz zz zz zz <xx xx yy yy zz zz zz zz>>
172: // If it has data, this will come straigh after, and run for the length
173: // <xx xx yy yy zz zz zz zz dd dd dd dd dd dd dd>
174: // All lengths given exclude the 8 byte record header
175: // (Data records are known as Atoms)
176:
177: // Document should start with:
178: // 0F 00 E8 03 ## ## ## ##
179: // (type 1000 = document, info 00 0f is normal, rest is document length)
180: // 01 00 E9 03 28 00 00 00
181: // (type 1001 = document atom, info 00 01 normal, 28 bytes long)
182:
183: // When parsing a document, look to see if you know about that type
184: // of the current record. If you know it's a type that has children,
185: // process the record's data area looking for more records
186: // If you know about the type and it doesn't have children, either do
187: // something with the data (eg TextRun) or skip over it
188: // Otherwise, check the first byte. If you do a BINARY_AND on it with
189: // 0x0f (15) and get back 0x0f, you know it has children. Otherwise
190: // it doesn't
191:
192: walkTree(0, 0, _docstream.length);
193: }
194:
195: public String makeHex(short s) {
196: String hex = Integer.toHexString((int) s).toUpperCase();
197: if (hex.length() == 1) {
198: return "0" + hex;
199: }
200: return hex;
201: }
202:
203: public String makeHex(int i) {
204: String hex = Integer.toHexString(i).toUpperCase();
205: if (hex.length() == 1) {
206: return "000" + hex;
207: }
208: if (hex.length() == 2) {
209: return "00" + hex;
210: }
211: if (hex.length() == 3) {
212: return "0" + hex;
213: }
214: return hex;
215: }
216:
217: public void walkTree(int depth, int startPos, int maxLen) {
218: int pos = startPos;
219: int endPos = startPos + maxLen;
220: int indent = depth;
221: while (pos <= endPos - 8) {
222: long type = LittleEndian.getUShort(_docstream, pos + 2);
223: long len = LittleEndian.getUInt(_docstream, pos + 4);
224: byte opt = _docstream[pos];
225:
226: String ind = "";
227: for (int i = 0; i < indent; i++) {
228: ind += " ";
229: }
230:
231: System.out.println(ind + "At position " + pos + " ("
232: + makeHex(pos) + "):");
233: System.out.println(ind + "Type is " + type + " ("
234: + makeHex((int) type) + "), len is " + len + " ("
235: + makeHex((int) len) + ")");
236:
237: // See if we know about the type of it
238: String recordName = RecordTypes.recordName((int) type);
239:
240: // Jump over header, and think about going on more
241: pos += 8;
242: if (recordName != null) {
243: System.out.println(ind + "That's a " + recordName);
244:
245: // Now check if it's a container or not
246: int container = (int) opt & 0x0f;
247:
248: // BinaryTagData seems to contain records, but it
249: // isn't tagged as doing so. Try stepping in anyway
250: if (type == 5003L && opt == 0L) {
251: container = 0x0f;
252: }
253:
254: if (type == 0L || (container != 0x0f)) {
255: System.out.println();
256: } else if (type == 1035l || type == 1036l) {
257: // Special Handling of 1035=PPDrawingGroup and 1036=PPDrawing
258: System.out.println();
259:
260: if (ddfEscher) {
261: // Seems to be:
262: walkEscherDDF((indent + 3), pos + 8,
263: (int) len - 8);
264: } else if (basicEscher) {
265: walkEscherBasic((indent + 3), pos + 8,
266: (int) len - 8);
267: }
268: } else {
269: // General container record handling code
270: System.out.println();
271: walkTree((indent + 2), pos, (int) len);
272: }
273: } else {
274: System.out.println(ind + "** unknown record **");
275: System.out.println();
276: }
277: pos += (int) len;
278: }
279: }
280:
281: /**
282: * Use the DDF code to walk the Escher records
283: */
284: public void walkEscherDDF(int indent, int pos, int len) {
285: if (len < 8) {
286: return;
287: }
288:
289: String ind = "";
290: for (int i = 0; i < indent; i++) {
291: ind += " ";
292: }
293:
294: byte[] contents = new byte[len];
295: System.arraycopy(_docstream, pos, contents, 0, len);
296: DefaultEscherRecordFactory erf = new DefaultEscherRecordFactory();
297: EscherRecord record = erf.createRecord(contents, 0);
298:
299: // For now, try filling in the fields
300: record.fillFields(contents, 0, erf);
301:
302: long atomType = LittleEndian.getUShort(contents, 2);
303: // This lacks the 8 byte header size
304: long atomLen = LittleEndian.getUShort(contents, 4);
305: // This (should) include the 8 byte header size
306: int recordLen = record.getRecordSize();
307:
308: System.out.println(ind + "At position " + pos + " ("
309: + makeHex(pos) + "):");
310: System.out.println(ind + "Type is " + atomType + " ("
311: + makeHex((int) atomType) + "), len is " + atomLen
312: + " (" + makeHex((int) atomLen) + ") (" + (atomLen + 8)
313: + ") - record claims " + recordLen);
314:
315: // Check for corrupt / lying ones
316: if (recordLen != 8 && (recordLen != (atomLen + 8))) {
317: System.out.println(ind + "** Atom length of " + atomLen
318: + " (" + (atomLen + 8)
319: + ") doesn't match record length of " + recordLen);
320: }
321:
322: // Print the record's details
323: if (record instanceof EscherContainerRecord) {
324: EscherContainerRecord ecr = (EscherContainerRecord) record;
325: System.out.println(ind + ecr.toString());
326: walkEscherDDF((indent + 3), pos + 8, (int) atomLen);
327: } else {
328: System.out.println(ind + record.toString());
329: }
330:
331: // Handle records that seem to lie
332: if (atomType == 61451l) {
333: // Normally claims a size of 8
334: recordLen = (int) atomLen + 8;
335: }
336: if (atomType == 61453l) {
337: // Returns EscherContainerRecord, but really msofbtClientTextbox
338: recordLen = (int) atomLen + 8;
339: record.fillFields(contents, 0, erf);
340: if (!(record instanceof EscherTextboxRecord)) {
341: System.out.println(ind
342: + "** Really a msofbtClientTextbox !");
343: }
344: }
345:
346: // Decide on what to do, based on how the lenghts match up
347: if (recordLen == 8 && atomLen > 8) {
348: // Assume it has children, rather than being corrupted
349: walkEscherDDF((indent + 3), pos + 8, (int) atomLen);
350:
351: // Wind on our length + our header
352: pos += atomLen;
353: pos += 8;
354: len -= atomLen;
355: len -= 8;
356: } else {
357: // No children, wind on our real length
358: pos += atomLen;
359: pos += 8;
360: len -= atomLen;
361: len -= 8;
362: }
363:
364: // Move on to the next one, if we're not at the end yet
365: if (len >= 8) {
366: walkEscherDDF(indent, pos, len);
367: }
368: }
369:
370: /**
371: * Use the basic record format groking code to walk the Escher records
372: */
373: public void walkEscherBasic(int indent, int pos, int len) {
374: if (len < 8) {
375: return;
376: }
377:
378: String ind = "";
379: for (int i = 0; i < indent; i++) {
380: ind += " ";
381: }
382:
383: long type = LittleEndian.getUShort(_docstream, pos + 2);
384: long atomlen = LittleEndian.getUInt(_docstream, pos + 4);
385: String typeS = makeHex((int) type);
386:
387: System.out.println(ind + "At position " + pos + " ("
388: + makeHex(pos) + "):");
389: System.out.println(ind + "Type is " + type + " (" + typeS
390: + "), len is " + atomlen + " ("
391: + makeHex((int) atomlen) + ")");
392:
393: String typeName = RecordTypes.recordName((int) type);
394: if (typeName != null) {
395: System.out.println(ind + "That's an Escher Record: "
396: + typeName);
397: } else {
398: System.out.println(ind + "(Unknown Escher Record)");
399: }
400:
401: // Code to print the first 8 bytes
402: // System.out.print(ind);
403: // for(int i=0; i<8; i++) {
404: // short bv = _docstream[i+pos];
405: // if(bv < 0) { bv += 256; }
406: // System.out.print(i + "=" + bv + " (" + makeHex(bv) + ") ");
407: // }
408: // System.out.println("");
409:
410: // Record specific dumps
411: if (type == 61453l) {
412: // Text Box. Print out first 8 bytes of data, then 8 4 later
413: System.out.print(ind);
414: for (int i = 8; i < 16; i++) {
415: short bv = _docstream[i + pos];
416: if (bv < 0) {
417: bv += 256;
418: }
419: System.out.print(i + "=" + bv + " (" + makeHex(bv)
420: + ") ");
421: }
422: System.out.println("");
423: System.out.print(ind);
424: for (int i = 20; i < 28; i++) {
425: short bv = _docstream[i + pos];
426: if (bv < 0) {
427: bv += 256;
428: }
429: System.out.print(i + "=" + bv + " (" + makeHex(bv)
430: + ") ");
431: }
432: System.out.println("");
433: }
434:
435: // Blank line before next entry
436: System.out.println("");
437:
438: // Look in children if we are a container
439: if (type == 61443l || type == 61444l) {
440: walkEscherBasic((indent + 3), pos + 8, (int) atomlen);
441: }
442:
443: // Keep going if not yet at end
444: if (atomlen < len) {
445: int atomleni = (int) atomlen;
446: walkEscherBasic(indent, pos + atomleni + 8, len - atomleni
447: - 8);
448: }
449: }
450: }
|