001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: package org.apache.poi.hslf.extractor;
019:
020: import java.io.*;
021: import java.util.HashSet;
022:
023: import org.apache.poi.POITextExtractor;
024: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
025: import org.apache.poi.hslf.*;
026: import org.apache.poi.hslf.model.*;
027: import org.apache.poi.hslf.usermodel.*;
028:
029: /**
030: * This class can be used to extract text from a PowerPoint file.
031: * Can optionally also get the notes from one.
032: *
033: * @author Nick Burch
034: */
035:
036: public class PowerPointExtractor extends POITextExtractor {
037: private HSLFSlideShow _hslfshow;
038: private SlideShow _show;
039: private Slide[] _slides;
040: private Notes[] _notes;
041:
042: /**
043: * Basic extractor. Returns all the text, and optionally all the notes
044: */
045: public static void main(String args[]) throws IOException {
046: if (args.length < 1) {
047: System.err.println("Useage:");
048: System.err.println("\tPowerPointExtractor [-notes] <file>");
049: System.exit(1);
050: }
051:
052: boolean notes = false;
053: String file;
054: if (args.length > 1) {
055: notes = true;
056: file = args[1];
057: } else {
058: file = args[0];
059: }
060:
061: PowerPointExtractor ppe = new PowerPointExtractor(file);
062: System.out.println(ppe.getText(true, notes));
063: ppe.close();
064: }
065:
066: /**
067: * Creates a PowerPointExtractor, from a file
068: * @param fileName The name of the file to extract from
069: */
070: public PowerPointExtractor(String fileName) throws IOException {
071: this (new FileInputStream(fileName));
072: }
073:
074: /**
075: * Creates a PowerPointExtractor, from an Input Stream
076: * @param iStream The input stream containing the PowerPoint document
077: */
078: public PowerPointExtractor(InputStream iStream) throws IOException {
079: this (new POIFSFileSystem(iStream));
080: }
081:
082: /**
083: * Creates a PowerPointExtractor, from an open POIFSFileSystem
084: * @param fs the POIFSFileSystem containing the PowerPoint document
085: */
086: public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
087: this (new HSLFSlideShow(fs));
088: }
089:
090: /**
091: * Creates a PowerPointExtractor, from a HSLFSlideShow
092: * @param ss the HSLFSlideShow to extract text from
093: */
094: public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
095: super (ss);
096: _hslfshow = ss;
097: _show = new SlideShow(_hslfshow);
098: _slides = _show.getSlides();
099: _notes = _show.getNotes();
100: }
101:
102: /**
103: * Shuts down the underlying streams
104: */
105: public void close() throws IOException {
106: _hslfshow.close();
107: _hslfshow = null;
108: _show = null;
109: _slides = null;
110: _notes = null;
111: }
112:
113: /**
114: * Fetches all the slide text from the slideshow, but not the notes
115: */
116: public String getText() {
117: return getText(true, false);
118: }
119:
120: /**
121: * Fetches all the notes text from the slideshow, but not the slide text
122: */
123: public String getNotes() {
124: return getText(false, true);
125: }
126:
127: /**
128: * Fetches text from the slideshow, be it slide text or note text.
129: * Because the final block of text in a TextRun normally have their
130: * last \n stripped, we add it back
131: * @param getSlideText fetch slide text
132: * @param getNoteText fetch note text
133: */
134: public String getText(boolean getSlideText, boolean getNoteText) {
135: StringBuffer ret = new StringBuffer();
136:
137: if (getSlideText) {
138: for (int i = 0; i < _slides.length; i++) {
139: Slide slide = _slides[i];
140: TextRun[] runs = slide.getTextRuns();
141: for (int j = 0; j < runs.length; j++) {
142: TextRun run = runs[j];
143: if (run != null) {
144: String text = run.getText();
145: ret.append(text);
146: if (!text.endsWith("\n")) {
147: ret.append("\n");
148: }
149: }
150: }
151: }
152: if (getNoteText) {
153: ret.append(" ");
154: }
155: }
156:
157: if (getNoteText) {
158: // Not currently using _notes, as that can have the notes of
159: // master sheets in. Grab Slide list, then work from there,
160: // but ensure no duplicates
161: HashSet seenNotes = new HashSet();
162: for (int i = 0; i < _slides.length; i++) {
163: Notes notes = _slides[i].getNotesSheet();
164: if (notes == null) {
165: continue;
166: }
167: Integer id = new Integer(notes._getSheetNumber());
168: if (seenNotes.contains(id)) {
169: continue;
170: }
171: seenNotes.add(id);
172:
173: TextRun[] runs = notes.getTextRuns();
174: if (runs != null && runs.length > 0) {
175: for (int j = 0; j < runs.length; j++) {
176: TextRun run = runs[j];
177: String text = run.getText();
178: ret.append(text);
179: if (!text.endsWith("\n")) {
180: ret.append("\n");
181: }
182: }
183: }
184: }
185: }
186:
187: return ret.toString();
188: }
189: }
|