001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: package org.apache.poi.hslf.extractor;
019:
020: import java.io.*;
021: import java.util.Vector;
022:
023: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
024: import org.apache.poi.poifs.filesystem.POIFSDocument;
025: import org.apache.poi.poifs.filesystem.DocumentEntry;
026: import org.apache.poi.poifs.filesystem.DocumentInputStream;
027: import org.apache.poi.util.LittleEndian;
028:
029: import org.apache.poi.hslf.record.CString;
030: import org.apache.poi.hslf.record.Record;
031: import org.apache.poi.hslf.record.RecordTypes;
032: import org.apache.poi.hslf.record.StyleTextPropAtom;
033: import org.apache.poi.hslf.record.TextHeaderAtom;
034: import org.apache.poi.hslf.record.TextBytesAtom;
035: import org.apache.poi.hslf.record.TextCharsAtom;
036: import org.apache.poi.hslf.model.TextRun;
037:
038: /**
039: * This class will get all the text from a Powerpoint Document, including
040: * all the bits you didn't want, and in a somewhat random order, but will
041: * do it very fast.
042: * The class ignores most of the hslf classes, and doesn't use
043: * HSLFSlideShow. Instead, it just does a very basic scan through the
044: * file, grabbing all the text records as it goes. It then returns the
045: * text, either as a single string, or as a vector of all the individual
046: * strings.
047: * Because of how it works, it will return a lot of "crud" text that you
048: * probably didn't want! It will return text from master slides. It will
049: * return duplicate text, and some mangled text (powerpoint files often
050: * have duplicate copies of slide text in them). You don't get any idea
051: * what the text was associated with.
052: * Almost everyone will want to use @see PowerPointExtractor instead. There
053: * are only a very small number of cases (eg some performance sensitive
054: * lucene indexers) that would ever want to use this!
055: *
056: * @author Nick Burch
057: */
058:
059: public class QuickButCruddyTextExtractor {
060: private POIFSFileSystem fs;
061: private InputStream is;
062: private byte[] pptContents;
063:
064: /**
065: * Really basic text extractor, that will also return lots of crud text.
066: * Takes a single argument, the file to extract from
067: */
068: public static void main(String args[]) throws IOException {
069: if (args.length < 1) {
070: System.err.println("Useage:");
071: System.err.println("\tQuickButCruddyTextExtractor <file>");
072: System.exit(1);
073: }
074:
075: String file = args[0];
076:
077: QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(
078: file);
079: System.out.println(ppe.getTextAsString());
080: ppe.close();
081: }
082:
083: /**
084: * Creates an extractor from a given file name
085: * @param fileName
086: */
087: public QuickButCruddyTextExtractor(String fileName)
088: throws IOException {
089: this (new FileInputStream(fileName));
090: }
091:
092: /**
093: * Creates an extractor from a given input stream
094: * @param iStream
095: */
096: public QuickButCruddyTextExtractor(InputStream iStream)
097: throws IOException {
098: this (new POIFSFileSystem(iStream));
099: is = iStream;
100: }
101:
102: /**
103: * Creates an extractor from a POIFS Filesystem
104: * @param poifs
105: */
106: public QuickButCruddyTextExtractor(POIFSFileSystem poifs)
107: throws IOException {
108: fs = poifs;
109:
110: // Find the PowerPoint bit, and get out the bytes
111: DocumentEntry docProps = (DocumentEntry) fs.getRoot().getEntry(
112: "PowerPoint Document");
113: pptContents = new byte[docProps.getSize()];
114: fs.createDocumentInputStream("PowerPoint Document").read(
115: pptContents);
116: }
117:
118: /**
119: * Shuts down the underlying streams
120: */
121: public void close() throws IOException {
122: if (is != null) {
123: is.close();
124: }
125: fs = null;
126: }
127:
128: /**
129: * Fetches the ALL the text of the powerpoint file, as a single string
130: */
131: public String getTextAsString() {
132: StringBuffer ret = new StringBuffer();
133: Vector textV = getTextAsVector();
134: for (int i = 0; i < textV.size(); i++) {
135: String text = (String) textV.get(i);
136: ret.append(text);
137: if (!text.endsWith("\n")) {
138: ret.append('\n');
139: }
140: }
141: return ret.toString();
142: }
143:
144: /**
145: * Fetches the ALL the text of the powerpoint file, in a vector of
146: * strings, one per text record
147: */
148: public Vector getTextAsVector() {
149: Vector textV = new Vector();
150:
151: // Set to the start of the file
152: int walkPos = 0;
153:
154: // Start walking the file, looking for the records
155: while (walkPos != -1) {
156: int newPos = findTextRecords(walkPos, textV);
157: walkPos = newPos;
158: }
159:
160: // Return what we find
161: return textV;
162: }
163:
164: /**
165: * For the given position, look if the record is a text record, and wind
166: * on after.
167: * If it is a text record, grabs out the text. Whatever happens, returns
168: * the position of the next record, or -1 if no more.
169: */
170: public int findTextRecords(int startPos, Vector textV) {
171: // Grab the length, and the first option byte
172: // Note that the length doesn't include the 8 byte atom header
173: int len = (int) LittleEndian.getUInt(pptContents, startPos + 4);
174: byte opt = pptContents[startPos];
175:
176: // If it's a container, step into it and return
177: // (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
178: int container = (int) opt & 0x0f;
179: if (container == 0x0f) {
180: return (startPos + 8);
181: }
182:
183: // Otherwise, check the type to see if it's text
184: long type = LittleEndian.getUShort(pptContents, startPos + 2);
185: TextRun trun = null;
186:
187: // TextBytesAtom
188: if (type == RecordTypes.TextBytesAtom.typeID) {
189: TextBytesAtom tba = (TextBytesAtom) Record
190: .createRecordForType(type, pptContents, startPos,
191: len + 8);
192: trun = new TextRun((TextHeaderAtom) null, tba,
193: (StyleTextPropAtom) null);
194: }
195: // TextCharsAtom
196: if (type == RecordTypes.TextCharsAtom.typeID) {
197: TextCharsAtom tca = (TextCharsAtom) Record
198: .createRecordForType(type, pptContents, startPos,
199: len + 8);
200: trun = new TextRun((TextHeaderAtom) null, tca,
201: (StyleTextPropAtom) null);
202: }
203:
204: // CString (doesn't go via a TextRun)
205: if (type == RecordTypes.CString.typeID) {
206: CString cs = (CString) Record.createRecordForType(type,
207: pptContents, startPos, len + 8);
208: String text = cs.getText();
209:
210: // Ignore the ones we know to be rubbish
211: if (text.equals("___PPT10")) {
212: } else if (text.equals("Default Design")) {
213: } else {
214: textV.add(text);
215: }
216: }
217:
218: // If we found text via a TextRun, save it in the vector
219: if (trun != null) {
220: textV.add(trun.getText());
221: }
222:
223: // Wind on by the atom length, and check we're not at the end
224: int newPos = (startPos + 8 + len);
225: if (newPos > (pptContents.length - 8)) {
226: newPos = -1;
227: }
228: return newPos;
229: }
230: }
|