001: /**
002: * Copyright (c) 2003-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.pdfparser;
031:
032: import java.io.ByteArrayOutputStream;
033: import java.io.InputStream;
034: import java.io.IOException;
035:
036: import java.util.ArrayList;
037: import java.util.List;
038:
039: import org.pdfbox.cos.COSBase;
040: import org.pdfbox.cos.COSBoolean;
041: import org.pdfbox.cos.COSDictionary;
042: import org.pdfbox.cos.COSName;
043: import org.pdfbox.cos.COSNull;
044: import org.pdfbox.cos.COSNumber;
045: import org.pdfbox.cos.COSObject;
046: import org.pdfbox.cos.COSStream;
047: import org.pdfbox.io.RandomAccess;
048:
049: import org.pdfbox.pdmodel.common.PDStream;
050: import org.pdfbox.util.PDFOperator;
051: import org.pdfbox.util.ImageParameters;
052:
053: /**
054: * This will parse a PDF byte stream and extract operands and such.
055: *
056: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
057: * @version $Revision: 1.32 $
058: */
059: public class PDFStreamParser extends BaseParser {
060: private List streamObjects = new ArrayList(100);
061: private RandomAccess file;
062: private PDFOperator lastBIToken = null;
063:
064: /**
065: * Constructor that takes a stream to parse.
066: *
067: * @param stream The stream to read data from.
068: * @param raf The random access file.
069: *
070: * @throws IOException If there is an error reading from the stream.
071: */
072: public PDFStreamParser(InputStream stream, RandomAccess raf)
073: throws IOException {
074: super (stream);
075: file = raf;
076: }
077:
078: /**
079: * Constructor.
080: *
081: * @param stream The stream to parse.
082: *
083: * @throws IOException If there is an error initializing the stream.
084: */
085: public PDFStreamParser(PDStream stream) throws IOException {
086: this (stream.createInputStream(), stream.getStream()
087: .getScratchFile());
088: }
089:
090: /**
091: * Constructor.
092: *
093: * @param stream The stream to parse.
094: *
095: * @throws IOException If there is an error initializing the stream.
096: */
097: public PDFStreamParser(COSStream stream) throws IOException {
098: this (stream.getUnfilteredStream(), stream.getScratchFile());
099: }
100:
101: /**
102: * This will parse the tokens in the stream. This will close the
103: * stream when it is finished parsing.
104: *
105: * @throws IOException If there is an error while parsing the stream.
106: */
107: public void parse() throws IOException {
108: try {
109: Object token = null;
110: while ((token = parseNextToken()) != null) {
111: streamObjects.add(token);
112: }
113: } finally {
114: pdfSource.close();
115: }
116: }
117:
118: /**
119: * This will get the tokens that were parsed from the stream.
120: *
121: * @return All of the tokens in the stream.
122: */
123: public List getTokens() {
124: return streamObjects;
125: }
126:
127: /**
128: * This will parse the next token in the stream.
129: *
130: * @return The next token in the stream or null if there are no more tokens in the stream.
131: *
132: * @throws IOException If an io error occurs while parsing the stream.
133: */
134: private Object parseNextToken() throws IOException {
135: Object retval = null;
136:
137: skipSpaces();
138: int nextByte = pdfSource.peek();
139: if (((byte) nextByte) == -1) {
140: return null;
141: }
142: char c = (char) nextByte;
143: switch (c) {
144: case '<': {
145: int leftBracket = pdfSource.read();//pull off first left bracket
146: c = (char) pdfSource.peek(); //check for second left bracket
147: pdfSource.unread(leftBracket); //put back first bracket
148: if (c == '<') {
149:
150: COSDictionary pod = parseCOSDictionary();
151: skipSpaces();
152: if ((char) pdfSource.peek() == 's') {
153: retval = parseCOSStream(pod, file);
154: } else {
155: retval = pod;
156: }
157: } else {
158: retval = parseCOSString();
159: }
160: break;
161: }
162: case '[': // array
163: {
164: retval = parseCOSArray();
165: break;
166: }
167: case '(': // string
168: retval = parseCOSString();
169: break;
170: case '/': // name
171: retval = parseCOSName();
172: break;
173: case 'n': // null
174: {
175: String nullString = readString();
176: if (nullString.equals("null")) {
177: retval = COSNull.NULL;
178: } else {
179: retval = PDFOperator.getOperator(nullString);
180: }
181: break;
182: }
183: case 't':
184: case 'f': {
185: String next = readString();
186: if (next.equals("true")) {
187: retval = COSBoolean.TRUE;
188: break;
189: } else if (next.equals("false")) {
190: retval = COSBoolean.FALSE;
191: } else {
192: retval = PDFOperator.getOperator(next);
193: }
194: break;
195: }
196: case 'R': {
197: String line = readString();
198: if (line.equals("R")) {
199: retval = new COSObject(null);
200: } else {
201: retval = PDFOperator.getOperator(line);
202: }
203: break;
204: }
205: case '0':
206: case '1':
207: case '2':
208: case '3':
209: case '4':
210: case '5':
211: case '6':
212: case '7':
213: case '8':
214: case '9':
215: case '-':
216: case '+':
217: case '.': {
218: if (Character.isDigit(c) || c == '-' || c == '+'
219: || c == '.') {
220: StringBuffer buf = new StringBuffer();
221: while (Character.isDigit((c = (char) pdfSource.peek()))
222: || c == '-' || c == '+' || c == '.') {
223: buf.append(c);
224: pdfSource.read();
225: }
226: retval = COSNumber.get(buf.toString());
227: } else {
228: throw new IOException("Unknown dir object c='" + c
229: + "' peek='" + (char) pdfSource.peek() + "' "
230: + pdfSource);
231: }
232: break;
233: }
234: case 'B': {
235: String next = readString();
236: retval = PDFOperator.getOperator(next);
237:
238: if (next.equals("BI")) {
239: lastBIToken = (PDFOperator) retval;
240: COSDictionary imageParams = new COSDictionary();
241: lastBIToken.setImageParameters(new ImageParameters(
242: imageParams));
243: Object nextToken = null;
244: while ((nextToken = parseNextToken()) instanceof COSName) {
245: Object value = parseNextToken();
246: imageParams.setItem((COSName) nextToken,
247: (COSBase) value);
248: }
249: //final token will be the image data, maybe??
250: PDFOperator imageData = (PDFOperator) nextToken;
251: lastBIToken.setImageData(imageData.getImageData());
252: }
253: break;
254: }
255: case 'I': {
256: //ImageParameters imageParams = lastBIToken.getImageParameters();
257:
258: //int expectedBytes = (int)Math.ceil(imageParams.getHeight() * imageParams.getWidth() *
259: // (imageParams.getBitsPerComponent()/8) );
260: //Special case for ID operator
261: String id = "" + (char) pdfSource.read()
262: + (char) pdfSource.read();
263: if (!id.equals("ID")) {
264: throw new IOException(
265: "Error: Expected operator 'ID' actual='" + id
266: + "'");
267: }
268: ByteArrayOutputStream imageData = new ByteArrayOutputStream();
269: //boolean foundEnd = false;
270: if (this .isWhitespace()) {
271: //pull off the whitespace character
272: pdfSource.read();
273: }
274: int twoBytesAgo = 0;
275: int lastByte = pdfSource.read();
276: int currentByte = pdfSource.read();
277: int count = 0;
278: //PDF spec is kinda unclear about this. Should a whitespace
279: //always appear before EI? Not sure, I found a PDF
280: //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
281: //of the image data and will stop parsing prematurely if there is
282: //not a check for <whitespace>EI<whitespace>.
283: while (!(isWhitespace(twoBytesAgo) && lastByte == 'E'
284: && currentByte == 'I' && isWhitespace() //&&
285: //amyuni2_05d__pdf1_3_acro4x.pdf has image data that
286: //is compressed, so expectedBytes is useless here.
287: //count >= expectedBytes
288: )
289: && !pdfSource.isEOF()) {
290: imageData.write(lastByte);
291: twoBytesAgo = lastByte;
292: lastByte = currentByte;
293: currentByte = pdfSource.read();
294: count++;
295: }
296: pdfSource.unread('I'); //unread the EI operator
297: pdfSource.unread('E');
298: retval = PDFOperator.getOperator("ID");
299: ((PDFOperator) retval)
300: .setImageData(imageData.toByteArray());
301: break;
302: }
303: case ']': {
304: // some ']' around without its previous '['
305: // this means a PDF is somewhat corrupt but we will continue to parse.
306: pdfSource.read();
307: retval = COSNull.NULL; // must be a better solution than null...
308: break;
309: }
310: default: {
311: //we must be an operator
312: String operator = readOperator();
313: if (operator.trim().length() == 0) {
314: //we have a corrupt stream, stop reading here
315: retval = null;
316: } else {
317: retval = PDFOperator.getOperator(operator);
318: }
319: }
320:
321: }
322:
323: return retval;
324: }
325:
326: /**
327: * This will read an operator from the stream.
328: *
329: * @return The operator that was read from the stream.
330: *
331: * @throws IOException If there is an error reading from the stream.
332: */
333: protected String readOperator() throws IOException {
334: skipSpaces();
335:
336: //average string size is around 2 and the normal string buffer size is
337: //about 16 so lets save some space.
338: StringBuffer buffer = new StringBuffer(4);
339: while (!isWhitespace()
340: && !isClosing()
341: && !pdfSource.isEOF()
342: && pdfSource.peek() != (int) '['
343: && pdfSource.peek() != (int) '<'
344: && pdfSource.peek() != (int) '('
345: && pdfSource.peek() != (int) '/'
346: && (pdfSource.peek() < (int) '0' || pdfSource.peek() > (int) '9')) {
347: buffer.append((char) pdfSource.read());
348: }
349: return buffer.toString();
350: }
351: }
|