0001: /**
0002: * Copyright (c) 2003-2006, www.pdfbox.org
0003: * All rights reserved.
0004: *
0005: * Redistribution and use in source and binary forms, with or without
0006: * modification, are permitted provided that the following conditions are met:
0007: *
0008: * 1. Redistributions of source code must retain the above copyright notice,
0009: * this list of conditions and the following disclaimer.
0010: * 2. Redistributions in binary form must reproduce the above copyright notice,
0011: * this list of conditions and the following disclaimer in the documentation
0012: * and/or other materials provided with the distribution.
0013: * 3. Neither the name of pdfbox; nor the names of its
0014: * contributors may be used to endorse or promote products derived from this
0015: * software without specific prior written permission.
0016: *
0017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
0021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
0024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0027: *
0028: * http://www.pdfbox.org
0029: *
0030: */package org.pdfbox.pdfparser;
0031:
0032: import java.io.BufferedInputStream;
0033: import java.io.InputStream;
0034: import java.io.IOException;
0035: import java.io.OutputStream;
0036:
0037: import java.util.ArrayList;
0038: import java.util.List;
0039:
0040: import org.pdfbox.io.ByteArrayPushBackInputStream;
0041: import org.pdfbox.io.PushBackInputStream;
0042: import org.pdfbox.io.RandomAccess;
0043:
0044: import org.pdfbox.cos.COSArray;
0045: import org.pdfbox.cos.COSBase;
0046: import org.pdfbox.cos.COSBoolean;
0047: import org.pdfbox.cos.COSDictionary;
0048: import org.pdfbox.cos.COSDocument;
0049: import org.pdfbox.cos.COSInteger;
0050: import org.pdfbox.cos.COSName;
0051: import org.pdfbox.cos.COSNull;
0052: import org.pdfbox.cos.COSNumber;
0053: import org.pdfbox.cos.COSObject;
0054: import org.pdfbox.cos.COSStream;
0055: import org.pdfbox.cos.COSString;
0056:
0057: import org.pdfbox.persistence.util.COSObjectKey;
0058:
0059: /**
0060: * This class is used to contain parsing logic that will be used by both the
0061: * PDFParser and the COSStreamParser.
0062: *
0063: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
0064: * @version $Revision: 1.59 $
0065: */
0066: public abstract class BaseParser {
0067: /**
0068: * This is a byte array that will be used for comparisons.
0069: */
0070: public static final byte[] ENDSTREAM = new byte[] { 101, 110, 100,
0071: 115, 116, 114, 101, 97, 109 };//"endstream".getBytes( "ISO-8859-1" );
0072:
0073: /**
0074: * This is a byte array that will be used for comparisons.
0075: */
0076: public static final String DEF = "def";
0077:
0078: /**
0079: * This is the stream that will be read from.
0080: */
0081: //protected PushBackByteArrayStream pdfSource;
0082: protected PushBackInputStream pdfSource;
0083:
0084: /**
0085: * moved xref here, is a persistence construct
0086: * maybe not needed anyway when not read from behind with delayed
0087: * access to objects.
0088: */
0089: private List xrefs = new ArrayList();
0090:
0091: private COSDocument document;
0092:
0093: /**
0094: * Constructor.
0095: *
0096: * @param input The input stream to read the data from.
0097: *
0098: * @throws IOException If there is an error reading the input stream.
0099: */
0100: public BaseParser(InputStream input) throws IOException {
0101: //pdfSource = new PushBackByteArrayStream( input );
0102: pdfSource = new PushBackInputStream(new BufferedInputStream(
0103: input, 16384), 4096);
0104: }
0105:
/**
 * Creates a parser that reads its data from an in-memory byte array by
 * wrapping the array in a <code>ByteArrayPushBackInputStream</code>.
 *
 * @param input The array to read the data from.
 *
 * @throws IOException If there is an error reading the byte data.
 */
protected BaseParser(byte[] input) throws IOException {
    this.pdfSource = new ByteArrayPushBackInputStream(input);
}
0116:
0117: /**
0118: * Set the document for this stream.
0119: *
0120: * @param doc The current document.
0121: */
0122: public void setDocument(COSDocument doc) {
0123: document = doc;
0124: }
0125:
0126: private static boolean isHexDigit(char ch) {
0127: return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f')
0128: || (ch >= 'A' && ch <= 'F');
0129: // the line below can lead to problems with certain versions of the IBM JIT compiler
0130: // (and is slower anyway)
0131: //return (HEXDIGITS.indexOf(ch) != -1);
0132: }
0133:
0134: /**
0135: * This will parse a PDF dictionary value.
0136: *
0137: * @return The parsed Dictionary object.
0138: *
0139: * @throws IOException If there is an error parsing the dictionary object.
0140: */
0141: private COSBase parseCOSDictionaryValue() throws IOException {
0142: COSBase retval = null;
0143: COSBase number = parseDirObject();
0144: skipSpaces();
0145: char next = (char) pdfSource.peek();
0146: if (next >= '0' && next <= '9') {
0147: COSBase generationNumber = parseDirObject();
0148: skipSpaces();
0149: char r = (char) pdfSource.read();
0150: if (r != 'R') {
0151: throw new IOException("expected='R' actual='" + r
0152: + "' " + pdfSource);
0153: }
0154: COSObjectKey key = new COSObjectKey(((COSInteger) number)
0155: .intValue(), ((COSInteger) generationNumber)
0156: .intValue());
0157: retval = document.getObjectFromPool(key);
0158: } else {
0159: retval = number;
0160: }
0161: return retval;
0162: }
0163:
0164: /**
0165: * This will parse a PDF dictionary.
0166: *
0167: * @return The parsed dictionary.
0168: *
0169: * @throws IOException IF there is an error reading the stream.
0170: */
0171: protected COSDictionary parseCOSDictionary() throws IOException {
0172: char c = (char) pdfSource.read();
0173: if (c != '<') {
0174: throw new IOException("expected='<' actual='" + c + "'");
0175: }
0176: c = (char) pdfSource.read();
0177: if (c != '<') {
0178: throw new IOException("expected='<' actual='" + c + "' "
0179: + pdfSource);
0180: }
0181: skipSpaces();
0182: COSDictionary obj = new COSDictionary();
0183: boolean done = false;
0184: while (!done) {
0185: skipSpaces();
0186: c = (char) pdfSource.peek();
0187: if (c == '>') {
0188: done = true;
0189: } else {
0190: COSName key = parseCOSName();
0191: COSBase value = parseCOSDictionaryValue();
0192: skipSpaces();
0193: if (((char) pdfSource.peek()) == 'd') {
0194: //if the next string is 'def' then we are parsing a cmap stream
0195: //and want to ignore it, otherwise throw an exception.
0196: String potentialDEF = readString();
0197: if (!potentialDEF.equals(DEF)) {
0198: pdfSource.unread(potentialDEF.getBytes());
0199: } else {
0200: skipSpaces();
0201: }
0202: }
0203:
0204: if (value == null) {
0205: throw new IOException("Bad Dictionary Declaration "
0206: + pdfSource);
0207: }
0208: obj.setItem(key, value);
0209: }
0210: }
0211: char ch = (char) pdfSource.read();
0212: if (ch != '>') {
0213: throw new IOException("expected='>' actual='" + ch + "'");
0214: }
0215: ch = (char) pdfSource.read();
0216: if (ch != '>') {
0217: throw new IOException("expected='>' actual='" + ch + "'");
0218: }
0219: return obj;
0220: }
0221:
0222: /**
0223: * This will read a COSStream from the input stream.
0224: *
0225: * @param file The file to write the stream to when reading.
0226: * @param dic The dictionary that goes with this stream.
0227: *
0228: * @return The parsed pdf stream.
0229: *
0230: * @throws IOException If there is an error reading the stream.
0231: */
0232: protected COSStream parseCOSStream(COSDictionary dic,
0233: RandomAccess file) throws IOException {
0234: COSStream stream = new COSStream(dic, file);
0235: OutputStream out = null;
0236: try {
0237: String streamString = readString();
0238: //long streamLength;
0239:
0240: if (!streamString.equals("stream")) {
0241: throw new IOException("expected='stream' actual='"
0242: + streamString + "'");
0243: }
0244:
0245: //PDF Ref 3.2.7 A stream must be followed by either
0246: //a CRLF or LF but nothing else.
0247:
0248: int whitespace = pdfSource.read();
0249:
0250: //see brother_scan_cover.pdf, it adds whitespaces
0251: //after the stream but before the start of the
0252: //data, so just read those first
0253: while (whitespace == 0x20) {
0254: whitespace = pdfSource.read();
0255: }
0256:
0257: if (whitespace == 0x0D) {
0258: whitespace = pdfSource.read();
0259: if (whitespace != 0x0A) {
0260: pdfSource.unread(whitespace);
0261: //The spec says this is invalid but it happens in the real
0262: //world so we must support it.
0263: //throw new IOException("expected='0x0A' actual='0x" +
0264: // Integer.toHexString(whitespace) + "' " + pdfSource);
0265: }
0266: } else if (whitespace == 0x0A) {
0267: //that is fine
0268: } else {
0269: //we are in an error.
0270: //but again we will do a lenient parsing and just assume that everything
0271: //is fine
0272: pdfSource.unread(whitespace);
0273: //throw new IOException("expected='0x0D or 0x0A' actual='0x" +
0274: //Integer.toHexString(whitespace) + "' " + pdfSource);
0275:
0276: }
0277:
0278: COSBase streamLength = dic
0279: .getDictionaryObject(COSName.LENGTH);
0280: /*long length = -1;
0281: if( streamLength instanceof COSNumber )
0282: {
0283: length = ((COSNumber)streamLength).intValue();
0284: }
0285: else if( streamLength instanceof COSObject &&
0286: ((COSObject)streamLength).getObject() instanceof COSNumber )
0287: {
0288: length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
0289: }*/
0290:
0291: //length = -1;
0292: //streamLength = null;
0293: //Need to keep track of the
0294: out = stream.createFilteredStream(streamLength);
0295: String endStream = null;
0296: //the length is wrong in some pdf documents which means
0297: //that PDFBox must basically ignore it in order to be able to read
0298: //the most number of PDF documents. This of course is a penalty hit,
0299: //maybe I could implement a faster parser.
0300: /**if( length != -1 )
0301: {
0302: byte[] buffer = new byte[1024];
0303: int amountRead = 0;
0304: int totalAmountRead = 0;
0305: while( amountRead != -1 && totalAmountRead < length )
0306: {
0307: int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
0308: amountRead = pdfSource.read(buffer,0,maxAmountToRead);
0309: totalAmountRead += amountRead;
0310: if( amountRead != -1 )
0311: {
0312: out.write( buffer, 0, amountRead );
0313: }
0314: }
0315: }
0316: else
0317: {**/
0318: readUntilEndStream(out);
0319: /**}*/
0320: skipSpaces();
0321: endStream = readString();
0322:
0323: if (!endStream.equals("endstream")) {
0324: readUntilEndStream(out);
0325: endStream = readString();
0326: if (!endStream.equals("endstream")) {
0327: throw new IOException(
0328: "expected='endstream' actual='" + endStream
0329: + "' " + pdfSource);
0330: }
0331: }
0332: } finally {
0333: if (out != null) {
0334: out.close();
0335: }
0336: }
0337: return stream;
0338: }
0339:
0340: private void readUntilEndStream(OutputStream out)
0341: throws IOException {
0342: int currentIndex = 0;
0343: int byteRead = 0;
0344: //this is the additional bytes buffered but not written
0345: int additionalBytes = 0;
0346: byte[] buffer = new byte[ENDSTREAM.length + additionalBytes];
0347: int writeIndex = 0;
0348: while (!cmpCircularBuffer(buffer, currentIndex, ENDSTREAM)
0349: && byteRead != -1) {
0350: writeIndex = currentIndex - buffer.length;
0351: if (writeIndex >= 0) {
0352: out.write(buffer[writeIndex % buffer.length]);
0353: }
0354: byteRead = pdfSource.read();
0355: buffer[currentIndex % buffer.length] = (byte) byteRead;
0356: currentIndex++;
0357: }
0358:
0359: //we want to ignore the end of the line data when reading a stream
0360: //so will make an attempt to ignore it.
0361: /*writeIndex = currentIndex - buffer.length;
0362: if( buffer[writeIndex%buffer.length] == 13 &&
0363: buffer[(writeIndex+1)%buffer.length] == 10 )
0364: {
0365: //then ignore the newline before the endstream
0366: }
0367: else if( buffer[(writeIndex+1)%buffer.length] == 10 )
0368: {
0369: //Then first byte is data, second byte is newline
0370: out.write( buffer[writeIndex%buffer.length] );
0371: }
0372: else
0373: {
0374: out.write( buffer[writeIndex%buffer.length] );
0375: out.write( buffer[(writeIndex+1)%buffer.length] );
0376: }*/
0377:
0378: /**
0379: * Old way of handling newlines before endstream
0380: for( int i=0; i<additionalBytes; i++ )
0381: {
0382: writeIndex = currentIndex - buffer.length;
0383: if( writeIndex >=0 &&
0384: //buffer[writeIndex%buffer.length] != 10 &&
0385: buffer[writeIndex%buffer.length] != 13 )
0386: {
0387: out.write( buffer[writeIndex%buffer.length] );
0388: }
0389: currentIndex++;
0390: }
0391: */
0392: pdfSource.unread(ENDSTREAM);
0393:
0394: }
0395:
0396: /**
0397: * This basically checks to see if the next compareTo.length bytes of the
0398: * buffer match the compareTo byte array.
0399: */
0400: private boolean cmpCircularBuffer(byte[] buffer, int currentIndex,
0401: byte[] compareTo) {
0402: int cmpLen = compareTo.length;
0403: int buflen = buffer.length;
0404: boolean match = true;
0405: int off = currentIndex - cmpLen;
0406: if (off < 0) {
0407: match = false;
0408: }
0409: for (int i = 0; match && i < cmpLen; ++i) {
0410: match = buffer[(off + i) % buflen] == compareTo[i];
0411: }
0412: return match;
0413: }
0414:
0415: /**
0416: * This will parse a PDF string.
0417: *
0418: * @return The parsed PDF string.
0419: *
0420: * @throws IOException If there is an error reading from the stream.
0421: */
0422: protected COSString parseCOSString() throws IOException {
0423: char nextChar = (char) pdfSource.read();
0424: COSString retval = new COSString();
0425: char openBrace;
0426: char closeBrace;
0427: if (nextChar == '(') {
0428: openBrace = '(';
0429: closeBrace = ')';
0430: } else if (nextChar == '<') {
0431: openBrace = '<';
0432: closeBrace = '>';
0433: } else {
0434: throw new IOException(
0435: "parseCOSString string should start with '(' or '<' and not '"
0436: + nextChar + "' " + pdfSource);
0437: }
0438:
0439: //This is the number of braces read
0440: //
0441: int braces = 1;
0442: int c = pdfSource.read();
0443: while (braces > 0 && c != -1) {
0444: char ch = (char) c;
0445: int nextc = -2; // not yet read
0446: //if( log.isDebugEnabled() )
0447: //{
0448: // log.debug( "Parsing COSString character '" + c + "' code=" + (int)c );
0449: //}
0450:
0451: if (ch == closeBrace) {
0452: braces--;
0453: byte[] nextThreeBytes = new byte[3];
0454: int amountRead = pdfSource.read(nextThreeBytes);
0455:
0456: //lets handle the special case seen in Bull River Rules and Regulations.pdf
0457: //The dictionary looks like this
0458: // 2 0 obj
0459: // <<
0460: // /Type /Info
0461: // /Creator (PaperPort http://www.scansoft.com)
0462: // /Producer (sspdflib 1.0 http://www.scansoft.com)
0463: // /Title ( (5)
0464: // /Author ()
0465: // /Subject ()
0466: //
0467: // Notice the /Title, the braces are not even but they should
0468: // be. So lets assume that if we encounter an this scenario
0469: // <end_brace><new_line><opening_slash> then that
0470: // means that there is an error in the pdf and assume that
0471: // was the end of the document.
0472: if (amountRead == 3) {
0473: if (nextThreeBytes[0] == 0x0d
0474: && nextThreeBytes[1] == 0x0a
0475: && nextThreeBytes[2] == 0x2f) {
0476: braces = 0;
0477: }
0478: }
0479: pdfSource.unread(nextThreeBytes, 0, amountRead);
0480: if (braces != 0) {
0481: retval.append(ch);
0482: }
0483: } else if (ch == openBrace) {
0484: braces++;
0485: retval.append(ch);
0486: } else if (ch == '\\') {
0487: //patched by ram
0488: char next = (char) pdfSource.read();
0489: switch (next) {
0490: case 'n':
0491: retval.append('\n');
0492: break;
0493: case 'r':
0494: retval.append('\r');
0495: break;
0496: case 't':
0497: retval.append('\t');
0498: break;
0499: case 'b':
0500: retval.append('\b');
0501: break;
0502: case 'f':
0503: retval.append('\f');
0504: break;
0505: case '(':
0506: case ')':
0507: case '\\':
0508: retval.append(next);
0509: break;
0510: case 10:
0511: case 13:
0512: //this is a break in the line so ignore it and the newline and continue
0513: c = pdfSource.read();
0514: while (isEOL(c) && c != -1) {
0515: c = pdfSource.read();
0516: }
0517: nextc = c;
0518: break;
0519: case '0':
0520: case '1':
0521: case '2':
0522: case '3':
0523: case '4':
0524: case '5':
0525: case '6':
0526: case '7': {
0527: StringBuffer octal = new StringBuffer();
0528: octal.append(next);
0529: c = pdfSource.read();
0530: char digit = (char) c;
0531: if (digit >= '0' && digit <= '7') {
0532: octal.append(digit);
0533: c = pdfSource.read();
0534: digit = (char) c;
0535: if (digit >= '0' && digit <= '7') {
0536: octal.append(digit);
0537: } else {
0538: nextc = c;
0539: }
0540: } else {
0541: nextc = c;
0542: }
0543:
0544: int character = 0;
0545: try {
0546: character = Integer.parseInt(octal.toString(),
0547: 8);
0548: } catch (NumberFormatException e) {
0549: throw new IOException(
0550: "Error: Expected octal character, actual='"
0551: + octal + "'");
0552: }
0553: retval.append(character);
0554: break;
0555: }
0556: default: {
0557: retval.append('\\');
0558: retval.append(next);
0559: //another ficken problem with PDF's, sometimes the \ doesn't really
0560: //mean escape like the PDF spec says it does, sometimes is should be literal
0561: //which is what we will assume here.
0562: //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
0563: }
0564: }
0565: } else {
0566: if (openBrace == '<') {
0567: if (isHexDigit(ch)) {
0568: retval.append(ch);
0569: }
0570: } else {
0571: retval.append(ch);
0572: }
0573: }
0574: if (nextc != -2) {
0575: c = nextc;
0576: } else {
0577: c = pdfSource.read();
0578: }
0579: }
0580: if (c != -1) {
0581: pdfSource.unread(c);
0582: }
0583: if (openBrace == '<') {
0584: retval = COSString.createFromHexString(retval.getString());
0585: }
0586: return retval;
0587: }
0588:
0589: /**
0590: * This will parse a PDF array object.
0591: *
0592: * @return The parsed PDF array.
0593: *
0594: * @throws IOException If there is an error parsing the stream.
0595: */
0596: protected COSArray parseCOSArray() throws IOException {
0597: char ch = (char) pdfSource.read();
0598: if (ch != '[') {
0599: throw new IOException("expected='[' actual='" + ch + "'");
0600: }
0601: COSArray po = new COSArray();
0602: COSBase pbo = null;
0603: skipSpaces();
0604: int i = 0;
0605: while (((i = pdfSource.peek()) > 0) && ((char) i != ']')) {
0606: pbo = parseDirObject();
0607: if (pbo instanceof COSObject) {
0608: COSInteger genNumber = (COSInteger) po
0609: .remove(po.size() - 1);
0610: COSInteger number = (COSInteger) po
0611: .remove(po.size() - 1);
0612: COSObjectKey key = new COSObjectKey(number.intValue(),
0613: genNumber.intValue());
0614: pbo = document.getObjectFromPool(key);
0615: }
0616: if (pbo != null) {
0617: po.add(pbo);
0618: } else {
0619: //it could be a bad object in the array which is just skipped
0620: }
0621: skipSpaces();
0622: }
0623: pdfSource.read(); //read ']'
0624: skipSpaces();
0625: return po;
0626: }
0627:
0628: /**
0629: * Determine if a character terminates a PDF name.
0630: *
0631: * @param ch The character
0632: * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
0633: */
0634: protected boolean isEndOfName(char ch) {
0635: return (ch == ' ' || ch == 13 || ch == 10 || ch == 9
0636: || ch == '>' || ch == '<' || ch == '[' || ch == '/'
0637: || ch == ']' || ch == ')' || ch == '(' || ch == -1 //EOF
0638: );
0639: }
0640:
0641: /**
0642: * This will parse a PDF name from the stream.
0643: *
0644: * @return The parsed PDF name.
0645: *
0646: * @throws IOException If there is an error reading from the stream.
0647: */
0648: protected COSName parseCOSName() throws IOException {
0649: COSName retval = null;
0650: int c = pdfSource.read();
0651: if ((char) c != '/') {
0652: throw new IOException("expected='/' actual='" + (char) c
0653: + "'-" + c + " " + pdfSource);
0654: }
0655: // costruisce il nome
0656: StringBuffer buffer = new StringBuffer();
0657: c = pdfSource.read();
0658: while (c != -1) {
0659: char ch = (char) c;
0660: if (ch == '#') {
0661: char ch1 = (char) pdfSource.read();
0662: char ch2 = (char) pdfSource.read();
0663:
0664: // Prior to PDF v1.2, the # was not a special character. Also,
0665: // it has been observed that various PDF tools do not follow the
0666: // spec with respect to the # escape, even though they report
0667: // PDF versions of 1.2 or later. The solution here is that we
0668: // interpret the # as an escape only when it is followed by two
0669: // valid hex digits.
0670: //
0671: if (isHexDigit(ch1) && isHexDigit(ch2)) {
0672: String hex = "" + ch1 + ch2;
0673: try {
0674: buffer.append((char) Integer.parseInt(hex, 16));
0675: } catch (NumberFormatException e) {
0676: throw new IOException(
0677: "Error: expected hex number, actual='"
0678: + hex + "'");
0679: }
0680: c = pdfSource.read();
0681: } else {
0682: pdfSource.unread(ch2);
0683: c = ch1;
0684: buffer.append(ch);
0685: }
0686: } else if (isEndOfName(ch)) {
0687: break;
0688: } else {
0689: buffer.append(ch);
0690: c = pdfSource.read();
0691: }
0692: }
0693: if (c != -1) {
0694: pdfSource.unread(c);
0695: }
0696: retval = COSName.getPDFName(buffer.toString());
0697: return retval;
0698: }
0699:
0700: /**
0701: * This will parse a boolean object from the stream.
0702: *
0703: * @return The parsed boolean object.
0704: *
0705: * @throws IOException If an IO error occurs during parsing.
0706: */
0707: protected COSBoolean parseBoolean() throws IOException {
0708: COSBoolean retval = null;
0709: char c = (char) pdfSource.peek();
0710: if (c == 't') {
0711: byte[] trueArray = new byte[4];
0712: int amountRead = pdfSource.read(trueArray, 0, 4);
0713: String trueString = new String(trueArray, 0, amountRead);
0714: if (!trueString.equals("true")) {
0715: throw new IOException(
0716: "Error parsing boolean: expected='true' actual='"
0717: + trueString + "'");
0718: } else {
0719: retval = COSBoolean.TRUE;
0720: }
0721: } else if (c == 'f') {
0722: byte[] falseArray = new byte[5];
0723: int amountRead = pdfSource.read(falseArray, 0, 5);
0724: String falseString = new String(falseArray, 0, amountRead);
0725: if (!falseString.equals("false")) {
0726: throw new IOException(
0727: "Error parsing boolean: expected='true' actual='"
0728: + falseString + "'");
0729: } else {
0730: retval = COSBoolean.FALSE;
0731: }
0732: } else {
0733: throw new IOException(
0734: "Error parsing boolean expected='t or f' actual='"
0735: + c + "'");
0736: }
0737: return retval;
0738: }
0739:
0740: /**
0741: * This will parse a directory object from the stream.
0742: *
0743: * @return The parsed object.
0744: *
0745: * @throws IOException If there is an error during parsing.
0746: */
0747: protected COSBase parseDirObject() throws IOException {
0748: COSBase retval = null;
0749:
0750: skipSpaces();
0751: int nextByte = pdfSource.peek();
0752: char c = (char) nextByte;
0753: switch (c) {
0754: case '<': {
0755: int leftBracket = pdfSource.read();//pull off first left bracket
0756: c = (char) pdfSource.peek(); //check for second left bracket
0757: pdfSource.unread(leftBracket);
0758: if (c == '<') {
0759:
0760: retval = parseCOSDictionary();
0761: skipSpaces();
0762: } else {
0763: retval = parseCOSString();
0764: }
0765: break;
0766: }
0767: case '[': // array
0768: {
0769: retval = parseCOSArray();
0770: break;
0771: }
0772: case '(':
0773: retval = parseCOSString();
0774: break;
0775: case '/': // name
0776: retval = parseCOSName();
0777: break;
0778: case 'n': // null
0779: {
0780: String nullString = readString();
0781: if (!nullString.equals("null")) {
0782: throw new IOException("Expected='null' actual='"
0783: + nullString + "'");
0784: }
0785: retval = COSNull.NULL;
0786: break;
0787: }
0788: case 't': {
0789: byte[] trueBytes = new byte[4];
0790: int amountRead = pdfSource.read(trueBytes, 0, 4);
0791: String trueString = new String(trueBytes, 0, amountRead);
0792: if (trueString.equals("true")) {
0793: retval = COSBoolean.TRUE;
0794: } else {
0795: throw new IOException("expected true actual='"
0796: + trueString + "' " + pdfSource);
0797: }
0798: break;
0799: }
0800: case 'f': {
0801: byte[] falseBytes = new byte[5];
0802: int amountRead = pdfSource.read(falseBytes, 0, 5);
0803: String falseString = new String(falseBytes, 0, amountRead);
0804: if (falseString.equals("false")) {
0805: retval = COSBoolean.FALSE;
0806: } else {
0807: throw new IOException("expected false actual='"
0808: + falseString + "' " + pdfSource);
0809: }
0810: break;
0811: }
0812: case 'R':
0813: pdfSource.read();
0814: retval = new COSObject(null);
0815: break;
0816: case (char) -1:
0817: return null;
0818: default: {
0819: if (Character.isDigit(c) || c == '-' || c == '+'
0820: || c == '.') {
0821: StringBuffer buf = new StringBuffer();
0822: int ic = pdfSource.read();
0823: c = (char) ic;
0824: while (Character.isDigit(c) || c == '-' || c == '+'
0825: || c == '.' || c == 'E' || c == 'e') {
0826: buf.append(c);
0827: ic = pdfSource.read();
0828: c = (char) ic;
0829: }
0830: if (ic != -1) {
0831: pdfSource.unread(ic);
0832: }
0833: retval = COSNumber.get(buf.toString());
0834: } else {
0835: //This is not suppose to happen, but we will allow for it
0836: //so we are more compatible with POS writers that don't
0837: //follow the spec
0838: String badString = readString();
0839: //throw new IOException( "Unknown dir object c='" + c +
0840: //"' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
0841: if (badString == null || badString.length() == 0) {
0842: int peek = pdfSource.peek();
0843: // we can end up in an infinite loop otherwise
0844: throw new IOException("Unknown dir object c='" + c
0845: + "' cInt=" + (int) c + " peek='"
0846: + (char) peek + "' peekInt=" + peek + " "
0847: + pdfSource);
0848: }
0849:
0850: }
0851: }
0852: }
0853: return retval;
0854: }
0855:
0856: /**
0857: * This will read the next string from the stream.
0858: *
0859: * @return The string that was read from the stream.
0860: *
0861: * @throws IOException If there is an error reading from the stream.
0862: */
0863: protected String readString() throws IOException {
0864: skipSpaces();
0865: StringBuffer buffer = new StringBuffer();
0866: int c = pdfSource.read();
0867: while (!isEndOfName((char) c) && !isClosing(c) && c != -1) {
0868: buffer.append((char) c);
0869: c = pdfSource.read();
0870: }
0871: if (c != -1) {
0872: pdfSource.unread(c);
0873: }
0874: return buffer.toString();
0875: }
0876:
0877: /**
0878: * This will read bytes until the end of line marker occurs.
0879: *
0880: * @param theString The next expected string in the stream.
0881: *
0882: * @return The characters between the current position and the end of the line.
0883: *
0884: * @throws IOException If there is an error reading from the stream or theString does not match what was read.
0885: */
0886: protected String readExpectedString(String theString)
0887: throws IOException {
0888: int c = pdfSource.read();
0889: while (isWhitespace(c) && c != -1) {
0890: c = pdfSource.read();
0891: }
0892: StringBuffer buffer = new StringBuffer(theString.length());
0893: int charsRead = 0;
0894: while (!isEOL(c) && c != -1 && charsRead < theString.length()) {
0895: char next = (char) c;
0896: buffer.append(next);
0897: if (theString.charAt(charsRead) == next) {
0898: charsRead++;
0899: } else {
0900: throw new IOException("Error: Expected to read '"
0901: + theString + "' instead started reading '"
0902: + buffer.toString() + "'");
0903: }
0904: c = pdfSource.read();
0905: }
0906: while (isEOL(c) && c != -1) {
0907: c = pdfSource.read();
0908: }
0909: if (c != -1) {
0910: pdfSource.unread(c);
0911: }
0912: return buffer.toString();
0913: }
0914:
0915: /**
0916: * This will read the next string from the stream up to a certain length.
0917: *
0918: * @param length The length to stop reading at.
0919: *
0920: * @return The string that was read from the stream of length 0 to length.
0921: *
0922: * @throws IOException If there is an error reading from the stream.
0923: */
0924: protected String readString(int length) throws IOException {
0925: skipSpaces();
0926:
0927: int c = pdfSource.read();
0928:
0929: //average string size is around 2 and the normal string buffer size is
0930: //about 16 so lets save some space.
0931: StringBuffer buffer = new StringBuffer(length);
0932: while (!isWhitespace(c) && !isClosing(c) && c != -1
0933: && buffer.length() < length && c != '[' && c != '<'
0934: && c != '(' && c != '/') {
0935: buffer.append((char) c);
0936: c = pdfSource.read();
0937: }
0938: if (c != -1) {
0939: pdfSource.unread(c);
0940: }
0941: return buffer.toString();
0942: }
0943:
0944: /**
0945: * This will tell if the next character is a closing brace( close of PDF array ).
0946: *
0947: * @return true if the next byte is ']', false otherwise.
0948: *
0949: * @throws IOException If an IO error occurs.
0950: */
0951: protected boolean isClosing() throws IOException {
0952: return isClosing(pdfSource.peek());
0953: }
0954:
0955: /**
0956: * This will tell if the next character is a closing brace( close of PDF array ).
0957: *
0958: * @param c The character to check against end of line
0959: * @return true if the next byte is ']', false otherwise.
0960: */
0961: protected boolean isClosing(int c) {
0962: return c == ']';
0963: }
0964:
0965: /**
0966: * This will read bytes until the end of line marker occurs.
0967: *
0968: * @return The characters between the current position and the end of the line.
0969: *
0970: * @throws IOException If there is an error reading from the stream.
0971: */
0972: protected String readLine() throws IOException {
0973: int c = pdfSource.read();
0974: while (isWhitespace(c) && c != -1) {
0975: c = pdfSource.read();
0976: }
0977: StringBuffer buffer = new StringBuffer(11);
0978:
0979: while (!isEOL(c) && c != -1) {
0980: buffer.append((char) c);
0981: c = pdfSource.read();
0982: }
0983: while (isEOL(c) && c != -1) {
0984: c = pdfSource.read();
0985: }
0986: if (c != -1) {
0987: pdfSource.unread(c);
0988: }
0989: return buffer.toString();
0990: }
0991:
0992: /**
0993: * This will tell if the next byte to be read is an end of line byte.
0994: *
0995: * @return true if the next byte is 0x0A or 0x0D.
0996: *
0997: * @throws IOException If there is an error reading from the stream.
0998: */
0999: protected boolean isEOL() throws IOException {
1000: return isEOL(pdfSource.peek());
1001: }
1002:
1003: /**
1004: * This will tell if the next byte to be read is an end of line byte.
1005: *
1006: * @param c The character to check against end of line
1007: * @return true if the next byte is 0x0A or 0x0D.
1008: */
1009: protected boolean isEOL(int c) {
1010: return c == 10 || c == 13;
1011: }
1012:
1013: /**
1014: * This will tell if the next byte is whitespace or not.
1015: *
1016: * @return true if the next byte in the stream is a whitespace character.
1017: *
1018: * @throws IOException If there is an error reading from the stream.
1019: */
1020: protected boolean isWhitespace() throws IOException {
1021: return isWhitespace(pdfSource.peek());
1022: }
1023:
1024: /**
1025: * This will tell if the next byte is whitespace or not.
1026: *
1027: * @param c The character to check against whitespace
1028: *
1029: * @return true if the next byte in the stream is a whitespace character.
1030: */
1031: protected boolean isWhitespace(int c) {
1032: return c == 0 || c == 9 || c == 12 || c == 10 || c == 13
1033: || c == 32;
1034: }
1035:
1036: /**
1037: * This will skip all spaces and comments that are present.
1038: *
1039: * @throws IOException If there is an error reading from the stream.
1040: */
1041: protected void skipSpaces() throws IOException {
1042: //log( "skipSpaces() " + pdfSource );
1043: int c = pdfSource.read();
1044: // identical to, but faster as: isWhiteSpace(c) || c == 37
1045: while (c == 0 || c == 9 || c == 12 || c == 10 || c == 13
1046: || c == 32 || c == 37)//37 is the % character, a comment
1047: {
1048: if (c == 37) {
1049: // skip past the comment section
1050: c = pdfSource.read();
1051: while (!isEOL(c) && c != -1) {
1052: c = pdfSource.read();
1053: }
1054: } else {
1055: c = pdfSource.read();
1056: }
1057: }
1058: if (c != -1) {
1059: pdfSource.unread(c);
1060: }
1061: //log( "skipSpaces() done peek='" + (char)pdfSource.peek() + "'" );
1062: }
1063:
1064: /**
1065: * This will read an integer from the stream.
1066: *
1067: * @return The integer that was read from the stream.
1068: *
1069: * @throws IOException If there is an error reading from the stream.
1070: */
1071: protected int readInt() throws IOException {
1072: skipSpaces();
1073: int retval = 0;
1074:
1075: int lastByte = 0;
1076: StringBuffer intBuffer = new StringBuffer();
1077: while ((lastByte = pdfSource.read()) != 32 && lastByte != 10
1078: && lastByte != 13 && lastByte != 0 && //See sourceforge bug 853328
1079: lastByte != -1) {
1080: intBuffer.append((char) lastByte);
1081: }
1082: try {
1083: retval = Integer.parseInt(intBuffer.toString());
1084: } catch (NumberFormatException e) {
1085: throw new IOException(
1086: "Error: Expected an integer type, actual='"
1087: + intBuffer + "'");
1088: }
1089: return retval;
1090: }
1091:
1092: /**
1093: * This will add an xref.
1094: *
1095: * @param xref The xref to add.
1096: */
1097: public void addXref(PDFXref xref) {
1098: xrefs.add(xref);
1099: }
1100:
1101: /**
1102: * This will get all of the xrefs.
1103: *
1104: * @return A list of all xrefs.
1105: */
1106: public List getXrefs() {
1107: return xrefs;
1108: }
1109: }
|