001: /**
002: * Copyright (c) 2003-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox;
031:
032: import java.io.File;
033: import java.io.FileOutputStream;
034: import java.io.IOException;
035: import java.io.OutputStreamWriter;
036: import java.io.Writer;
037: import java.net.MalformedURLException;
038: import java.net.URL;
039:
040: import org.pdfbox.pdmodel.PDDocument;
041: import org.pdfbox.pdmodel.encryption.AccessPermission;
042: import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
043: import org.pdfbox.util.PDFText2HTML;
044: import org.pdfbox.util.PDFTextStripper;
045:
046: /**
047: * This is the main program that simply parses the pdf document and transforms it
048: * into text.
049: *
050: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
051: * @version $Revision: 1.14 $
052: */
053: public class ExtractText {
054: /**
055: * This is the default encoding of the text to be output.
056: */
057: public static final String DEFAULT_ENCODING = null;
058: //"ISO-8859-1";
059: //"ISO-8859-6"; //arabic
060: //"US-ASCII";
061: //"UTF-8";
062: //"UTF-16";
063: //"UTF-16BE";
064: //"UTF-16LE";
065:
066: private static final String PASSWORD = "-password";
067: private static final String ENCODING = "-encoding";
068: private static final String CONSOLE = "-console";
069: private static final String START_PAGE = "-startPage";
070: private static final String END_PAGE = "-endPage";
071: private static final String SORT = "-sort";
072: private static final String HTML = "-html"; // jjb - added simple HTML output
073:
074: /**
075: * private constructor.
076: */
077: private ExtractText() {
078: //static class
079: }
080:
081: /**
082: * Infamous main method.
083: *
084: * @param args Command line arguments, should be one and a reference to a file.
085: *
086: * @throws Exception If there is an error parsing the document.
087: */
088: public static void main(String[] args) throws Exception {
089: boolean toConsole = false;
090: boolean toHTML = false;
091: boolean sort = false;
092: String password = "";
093: String encoding = DEFAULT_ENCODING;
094: String pdfFile = null;
095: String textFile = null;
096: int startPage = 1;
097: int endPage = Integer.MAX_VALUE;
098: for (int i = 0; i < args.length; i++) {
099: if (args[i].equals(PASSWORD)) {
100: i++;
101: if (i >= args.length) {
102: usage();
103: }
104: password = args[i];
105: } else if (args[i].equals(ENCODING)) {
106: i++;
107: if (i >= args.length) {
108: usage();
109: }
110: encoding = args[i];
111: } else if (args[i].equals(START_PAGE)) {
112: i++;
113: if (i >= args.length) {
114: usage();
115: }
116: startPage = Integer.parseInt(args[i]);
117: } else if (args[i].equals(HTML)) {
118: toHTML = true;
119: } else if (args[i].equals(SORT)) {
120: sort = true;
121: } else if (args[i].equals(END_PAGE)) {
122: i++;
123: if (i >= args.length) {
124: usage();
125: }
126: endPage = Integer.parseInt(args[i]);
127: } else if (args[i].equals(CONSOLE)) {
128: toConsole = true;
129: } else {
130: if (pdfFile == null) {
131: pdfFile = args[i];
132: } else {
133: textFile = args[i];
134: }
135: }
136: }
137:
138: if (pdfFile == null) {
139: usage();
140: } else {
141:
142: Writer output = null;
143: PDDocument document = null;
144: try {
145: try {
146: //basically try to load it from a url first and if the URL
147: //is not recognized then try to load it from the file system.
148: URL url = new URL(pdfFile);
149: document = PDDocument.load(url);
150: String fileName = url.getFile();
151: if (textFile == null && fileName.length() > 4) {
152: File outputFile = new File(fileName.substring(
153: 0, fileName.length() - 4)
154: + ".txt");
155: textFile = outputFile.getName();
156: }
157: } catch (MalformedURLException e) {
158: document = PDDocument.load(pdfFile);
159: if (textFile == null && pdfFile.length() > 4) {
160: textFile = pdfFile.substring(0, pdfFile
161: .length() - 4)
162: + ".txt";
163: }
164: }
165:
166: //document.print();
167: if (document.isEncrypted()) {
168: StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(
169: password);
170: document.openProtection(sdm);
171: AccessPermission ap = document
172: .getCurrentAccessPermission();
173:
174: if (!ap.canExtractContent()) {
175: throw new IOException(
176: "You do not have permission to extract text");
177: }
178: }
179: if (toConsole) {
180: output = new OutputStreamWriter(System.out);
181: } else {
182: if (encoding != null) {
183: output = new OutputStreamWriter(
184: new FileOutputStream(textFile),
185: encoding);
186: } else {
187: //use default encoding
188: output = new OutputStreamWriter(
189: new FileOutputStream(textFile));
190: }
191: }
192:
193: PDFTextStripper stripper = null;
194: if (toHTML) {
195: stripper = new PDFText2HTML();
196: } else {
197: stripper = new PDFTextStripper();
198: }
199: stripper.setSortByPosition(sort);
200: stripper.setStartPage(startPage);
201: stripper.setEndPage(endPage);
202: stripper.writeText(document, output);
203: } finally {
204: if (output != null) {
205: output.close();
206: }
207: if (document != null) {
208: document.close();
209: }
210: }
211: }
212: }
213:
214: /**
215: * This will print the usage requirements and exit.
216: */
217: private static void usage() {
218: System.err
219: .println("Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n"
220: + " -password <password> Password to decrypt document\n"
221: + " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n"
222: + " -console Send text to console instead of file\n"
223: + " -html Output in HTML format instead of raw text\n"
224: + " -sort Sort the text before writing\n"
225: + " -startPage <number> The first page to start extraction(1 based)\n"
226: + " -endPage <number> The last page to extract(inclusive)\n"
227: + " <PDF file> The PDF document to use\n"
228: + " [Text File] The file to write the text to\n");
229: System.exit(1);
230: }
231: }
|