001: /**
002: * Copyright (c) 2003-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.pdfparser;
031:
032: import java.io.File;
033: import java.io.InputStream;
034: import java.io.IOException;
035:
036: import java.util.Iterator;
037:
038: import org.pdfbox.cos.COSBase;
039: import org.pdfbox.cos.COSDictionary;
040: import org.pdfbox.cos.COSDocument;
041: import org.pdfbox.cos.COSObject;
042: import org.pdfbox.cos.COSStream;
043: import org.pdfbox.exceptions.WrappedIOException;
044: import org.pdfbox.io.RandomAccess;
045:
046: import org.pdfbox.pdmodel.PDDocument;
047:
048: import org.pdfbox.pdmodel.fdf.FDFDocument;
049:
050: import org.pdfbox.persistence.util.COSObjectKey;
051:
052: /**
053: * This class will handle the parsing of the PDF document.
054: *
055: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
056: * @version $Revision: 1.53 $
057: */
058: public class PDFParser extends BaseParser {
059: private static final int SPACE_BYTE = 32;
060:
061: private static final String PDF_HEADER = "%PDF-";
062: private COSDocument document;
063:
064: /**
065: * Temp file directory.
066: */
067: private File tempDirectory = null;
068:
069: private RandomAccess raf = null;
070:
/**
 * Creates a parser for the given stream without supplying any storage
 * for the parsed document; parse() then picks the storage itself.
 *
 * @param input The input stream that contains the PDF document.
 *
 * @throws IOException If there is an error initializing the stream.
 */
public PDFParser(InputStream input) throws IOException {
    this(input, null);
}
081:
/**
 * Creates a parser for the given stream and remembers the storage that
 * parse() should hand to the COSDocument it creates.
 *
 * @param input The input stream that contains the PDF document.
 * @param rafi The RandomAccess to be used by the internal COSDocument;
 * may be null, in which case parse() creates the document without it.
 *
 * @throws IOException If there is an error initializing the stream.
 */
public PDFParser(InputStream input, RandomAccess rafi) throws IOException {
    super(input);
    raf = rafi;
}
094:
095: /**
096: * This is the directory where pdfbox will create a temporary file
097: * for storing pdf document stream in. By default this directory will
098: * be the value of the system property java.io.tmpdir.
099: *
100: * @param tmpDir The directory to create scratch files needed to store
101: * pdf document streams.
102: */
103: public void setTempDirectory(File tmpDir) {
104: tempDirectory = tmpDir;
105: }
106:
107: /**
108: * This will prase the stream and create the PDF document. This will close
109: * the stream when it is done parsing.
110: *
111: * @throws IOException If there is an error reading from the stream.
112: */
113: public void parse() throws IOException {
114: try {
115: if (raf == null) {
116: if (tempDirectory != null) {
117: document = new COSDocument(tempDirectory);
118: } else {
119: document = new COSDocument();
120: }
121: } else {
122: document = new COSDocument(raf);
123: }
124: setDocument(document);
125: String header = readLine();
126: document.setHeaderString(header);
127:
128: if (header.length() < PDF_HEADER.length() + 1) {
129: throw new IOException("Error: Header is corrupt '"
130: + header + "'");
131: }
132:
133: //sometimes there are some garbage bytes in the header before the header
134: //actually starts, so lets try to find the header first.
135: int headerStart = header.indexOf(PDF_HEADER);
136:
137: //greater than zero because if it is zero then
138: //there is no point of trimming
139: if (headerStart > 0) {
140: //trim off any leading characters
141: header = header.substring(headerStart, header.length());
142: }
143:
144: try {
145: float pdfVersion = Float.parseFloat(header.substring(
146: PDF_HEADER.length(), Math.min(header.length(),
147: PDF_HEADER.length() + 3)));
148: document.setVersion(pdfVersion);
149: } catch (NumberFormatException e) {
150: throw new IOException("Error getting pdf version:" + e);
151: }
152:
153: skipHeaderFillBytes();
154:
155: Object nextObject;
156: boolean wasLastParsedObjectAnXref = false;
157: try {
158: while ((nextObject = parseObject()) != null) {
159: if (nextObject instanceof PDFXref) {
160: PDFXref xref = (PDFXref) nextObject;
161: addXref(xref);
162: wasLastParsedObjectAnXref = true;
163: } else {
164: wasLastParsedObjectAnXref = false;
165: }
166: skipSpaces();
167: }
168: if (document.getTrailer() == null) {
169: COSDictionary trailer = new COSDictionary();
170: Iterator xrefIter = document.getObjectsByType(
171: "XRef").iterator();
172: while (xrefIter.hasNext()) {
173: COSStream next = (COSStream) ((COSObject) xrefIter
174: .next()).getObject();
175: trailer.addAll(next);
176: }
177: document.setTrailer(trailer);
178: }
179: if (!document.isEncrypted()) {
180: document.dereferenceObjectStreams();
181: }
182: } catch (IOException e) {
183: if (wasLastParsedObjectAnXref) {
184: //Then we assume that there is just random garbage after
185: //the xref, not sure why the PDF spec allows this but it does.
186: } else {
187: //some other error so just pass it along
188: throw e;
189: }
190: }
191: } catch (Throwable t) {
192: //so if the PDF is corrupt then close the document and clear
193: //all resources to it
194: if (document != null) {
195: document.close();
196: }
197: if (t instanceof IOException) {
198: throw (IOException) t;
199: } else {
200: throw new WrappedIOException(t);
201: }
202: } finally {
203: pdfSource.close();
204: }
205: }
206:
207: /**
208: * This will skip a header's binary fill bytes. This is in accordance to
209: * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
210: *
211: * @throws IOException If there is an error reading from the stream.
212: */
213: protected void skipHeaderFillBytes() throws IOException {
214: skipSpaces();
215: int c = pdfSource.peek();
216:
217: if (!Character.isDigit((char) c)) {
218: // Fill bytes conform with PDF reference (but without comment sign)
219: // => skip until EOL
220: readLine();
221: }
222: // else: no fill bytes
223: }
224:
225: /**
226: * This will get the document that was parsed. parse() must be called before this is called.
227: * When you are done with this document you must call close() on it to release
228: * resources.
229: *
230: * @return The document that was parsed.
231: *
232: * @throws IOException If there is an error getting the document.
233: */
234: public COSDocument getDocument() throws IOException {
235: if (document == null) {
236: throw new IOException(
237: "You must call parse() before calling getDocument()");
238: }
239: return document;
240: }
241:
242: /**
243: * This will get the PD document that was parsed. When you are done with
244: * this document you must call close() on it to release resources.
245: *
246: * @return The document at the PD layer.
247: *
248: * @throws IOException If there is an error getting the document.
249: */
250: public PDDocument getPDDocument() throws IOException {
251: return new PDDocument(getDocument());
252: }
253:
254: /**
255: * This will get the FDF document that was parsed. When you are done with
256: * this document you must call close() on it to release resources.
257: *
258: * @return The document at the PD layer.
259: *
260: * @throws IOException If there is an error getting the document.
261: */
262: public FDFDocument getFDFDocument() throws IOException {
263: return new FDFDocument(getDocument());
264: }
265:
266: /**
267: * This will parse a document object from the stream.
268: *
269: * @return The parsed object.
270: *
271: * @throws IOException If an IO error occurs.
272: */
273: private Object parseObject() throws IOException {
274: Object object = null;
275: skipSpaces();
276: char peekedChar = (char) pdfSource.peek();
277: while (peekedChar == 'e') {
278: //there are times when there are multiple endobj, so lets
279: //just read them and move on.
280: readString();
281: skipSpaces();
282: peekedChar = (char) pdfSource.peek();
283: }
284: if (pdfSource.isEOF()) {
285: //"Skipping because of EOF" );
286: //end of file we will return a null object and call it a day.
287: } else if (peekedChar == 'x' || peekedChar == 't'
288: || peekedChar == 's') {
289: //System.out.println( "parseObject() parsing xref" );
290:
291: //FDF documents do not always have the xref
292: if (peekedChar == 'x' || peekedChar == 't') {
293: object = parseXrefSection();
294: }
295:
296: //if peeked char is xref or startxref
297: if (peekedChar == 'x' || peekedChar == 's') {
298: skipSpaces();
299: while (pdfSource.peek() == 'x') {
300: parseXrefSection();
301: }
302: String startxref = readString();
303: if (!startxref.equals("startxref")) {
304: throw new IOException(
305: "expected='startxref' actual='" + startxref
306: + "' " + pdfSource);
307: }
308: skipSpaces();
309: //read some integer that is in the stream but PDFBox doesn't use
310: readInt();
311: }
312:
313: //This MUST be readLine because readString strips out comments
314: //and it will think that %% is a comment in from of the EOF
315: String eof = readExpectedString("%%EOF");
316: if (eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF()) {
317: throw new IOException("expected='%%EOF' actual='" + eof
318: + "' next=" + readString() + " next="
319: + readString());
320: } else if (!pdfSource.isEOF()) {
321: //we might really be at the end of the file, there might just be some crap at the
322: //end of the file.
323: pdfSource.fillBuffer();
324: if (pdfSource.available() < 1000) {
325: //We need to determine if we are at the end of the file.
326: byte[] data = new byte[1000];
327:
328: int amountRead = pdfSource.read(data);
329: if (amountRead != -1) {
330: pdfSource.unread(data, 0, amountRead);
331: }
332: boolean atEndOfFile = true;//we assume yes unless we find another.
333: for (int i = 0; i < amountRead - 3 && atEndOfFile; i++) {
334: atEndOfFile = !(data[i] == 'E'
335: && data[i + 1] == 'O' && data[i + 2] == 'F');
336: }
337: if (atEndOfFile) {
338: while (pdfSource.read(data, 0, data.length) != -1) {
339: //read until done.
340: }
341: }
342: }
343: }
344: } else {
345: int number = -1;
346: int genNum = -1;
347: String objectKey = null;
348: boolean missingObjectNumber = false;
349: try {
350: char peeked = (char) pdfSource.peek();
351: if (peeked == '<') {
352: missingObjectNumber = true;
353: } else {
354: number = readInt();
355: }
356: } catch (IOException e) {
357: //ok for some reason "GNU Ghostscript 5.10" puts two endobj
358: //statements after an object, of course this is nonsense
359: //but because we want to support as many PDFs as possible
360: //we will simply try again
361: number = readInt();
362: }
363: if (!missingObjectNumber) {
364: skipSpaces();
365: genNum = readInt();
366:
367: objectKey = readString(3);
368: //System.out.println( "parseObject() num=" + number +
369: //" genNumber=" + genNum + " key='" + objectKey + "'" );
370: if (!objectKey.equals("obj")) {
371: throw new IOException("expected='obj' actual='"
372: + objectKey + "' " + pdfSource);
373: }
374: } else {
375: number = -1;
376: genNum = -1;
377: }
378:
379: skipSpaces();
380: COSBase pb = parseDirObject();
381: String endObjectKey = readString();
382: if (endObjectKey.equals("stream")) {
383: pdfSource.unread(endObjectKey.getBytes());
384: pdfSource.unread(' ');
385: if (pb instanceof COSDictionary) {
386: pb = parseCOSStream((COSDictionary) pb,
387: getDocument().getScratchFile());
388: } else {
389: // this is not legal
390: // the combination of a dict and the stream/endstream forms a complete stream object
391: throw new IOException(
392: "stream not preceded by dictionary");
393: }
394: endObjectKey = readString();
395: }
396: COSObjectKey key = new COSObjectKey(number, genNum);
397: COSObject pdfObject = document.getObjectFromPool(key);
398: object = pdfObject;
399: pdfObject.setObject(pb);
400:
401: if (!endObjectKey.equals("endobj")) {
402: if (!pdfSource.isEOF()) {
403: try {
404: //It is possible that the endobj is missing, there
405: //are several PDFs out there that do that so skip it and move on.
406: Float.parseFloat(endObjectKey);
407: pdfSource.unread(SPACE_BYTE);
408: pdfSource.unread(endObjectKey.getBytes());
409: } catch (NumberFormatException e) {
410: //we will try again incase there was some garbage which
411: //some writers will leave behind.
412: String secondEndObjectKey = readString();
413: if (!secondEndObjectKey.equals("endobj")) {
414: if (isClosing()) {
415: //found a case with 17506.pdf object 41 that was like this
416: //41 0 obj [/Pattern /DeviceGray] ] endobj
417: //notice the second array close, here we are reading it
418: //and ignoring and attempting to continue
419: pdfSource.read();
420: }
421: skipSpaces();
422: String thirdPossibleEndObj = readString();
423: if (!thirdPossibleEndObj.equals("endobj")) {
424: throw new IOException(
425: "expected='endobj' firstReadAttempt='"
426: + endObjectKey + "' "
427: + "secondReadAttempt='"
428: + secondEndObjectKey
429: + "' " + pdfSource);
430: }
431: }
432: }
433: }
434: }
435: skipSpaces();
436:
437: }
438: //System.out.println( "parsed=" + object );
439: return object;
440: }
441:
442: /**
443: * This will parse the xref table and trailers from the stream.
444: *
445: * @return a new PDFXref
446: *
447: * @throws IOException If an IO error occurs.
448: */
449: protected PDFXref parseXrefSection() throws IOException {
450: int[] params = new int[2];
451: parseXrefTable(params);
452: parseTrailer();
453:
454: return new PDFXref(params[0], params[1]);
455: }
456:
457: /**
458: * This will parse the xref table from the stream.
459: *
460: * It stores the starting object number and the count
461: *
462: * @param params The start and count parameters
463: *
464: * @throws IOException If an IO error occurs.
465: */
466: protected void parseXrefTable(int[] params) throws IOException {
467: String nextLine = null;
468:
469: nextLine = readLine();
470: if (nextLine.equals("xref")) {
471: params[0] = readInt();
472: params[1] = readInt();
473: nextLine = readString();
474: }
475: skipSpaces();
476: while (!nextLine.equals("trailer") && !pdfSource.isEOF()
477: && !isEndOfName((char) pdfSource.peek())) {
478: //skip past all the xref entries.
479: nextLine = readString();
480: skipSpaces();
481: }
482: skipSpaces();
483: }
484:
485: private void parseTrailer() throws IOException {
486: COSDictionary parsedTrailer = parseCOSDictionary();
487: COSDictionary docTrailer = document.getTrailer();
488: if (docTrailer == null) {
489: document.setTrailer(parsedTrailer);
490: } else {
491: docTrailer.addAll(parsedTrailer);
492: }
493: }
494: }
|