0001: /*
0002: * $Id: PDFFile.java,v 1.5 2007/12/20 18:17:41 rbair Exp $
0003: *
0004: * Copyright 2004 Sun Microsystems, Inc., 4150 Network Circle,
0005: * Santa Clara, California 95054, U.S.A. All rights reserved.
0006: *
0007: * This library is free software; you can redistribute it and/or
0008: * modify it under the terms of the GNU Lesser General Public
0009: * License as published by the Free Software Foundation; either
0010: * version 2.1 of the License, or (at your option) any later version.
0011: *
0012: * This library is distributed in the hope that it will be useful,
0013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
0015: * Lesser General Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser General Public
0018: * License along with this library; if not, write to the Free Software
0019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
0020: */
0021:
0022: package com.sun.pdfview;
0023:
0024: import java.awt.geom.Rectangle2D;
0025: import java.io.File;
0026: import java.io.IOException;
0027: import java.io.RandomAccessFile;
0028: import java.nio.ByteBuffer;
0029: import java.nio.channels.FileChannel;
0030: import java.util.ArrayList;
0031: import java.util.HashMap;
0032: import java.util.Map;
0033:
0034: import com.sun.pdfview.action.GoToAction;
0035: import com.sun.pdfview.action.PDFAction;
0036:
0037: /**
0038: * An encapsulation of a .pdf file. The methods of this class
0039: * can parse the contents of a PDF file, but those methods are
0040: * hidden. Instead, the public methods of this class allow
0041: * access to the pages in the PDF file. Typically, you create
0042: * a new PDFFile, ask it for the number of pages, and then
0043: * request one or more PDFPages.
0044: * @author Mike Wessler
0045: */
0046: public class PDFFile {
/** the end of line character (initialised to a single line feed) */
String eol = "\n";

/**
 * A ByteBuffer containing the file data.  Every read method of this
 * class consumes bytes from this buffer and moves its position.
 */
ByteBuffer buf;

/**
 * the cross reference table mapping object numbers to locations
 * in the PDF file.  It is indexed by object number and is filled in by
 * readTrailer(); dereference() treats a null entry as a null object.
 */
PDFXref[] objIdx;

/**
 * the root PDFObject, as specified in the PDF file (the /Root entry
 * of the trailer dictionary)
 */
PDFObject root = null;

/**
 * the Encrypt PDFObject, from the trailer; stays null when the trailer
 * has no /Encrypt entry
 */
PDFObject encrypt = null;

/**
 * the page cache: getPage() stores a PDFPage and its PDFParser here,
 * keyed by page number
 */
Cache cache;

/**
 * whether the file is printable or not (trailer -> Encrypt -> P & 0x4).
 * Remains true unless an Encrypt dictionary with a /P entry clears it.
 */
private boolean printable = true;

/**
 * whether the file is saveable or not (trailer -> Encrypt -> P & 0x10).
 * Remains true unless an Encrypt dictionary with a /P entry clears it.
 */
private boolean saveable = true;
0079:
/**
 * Creates a PDFFile on top of a buffer that holds the PDF data.  The
 * cross reference table and the trailer are read right away by
 * parseFile(); page contents are only read when a page is requested.
 * <p>
 * Use the getPage(...) methods to get a page from the PDF file.
 *
 * @param buf the ByteBuffer containing the PDF.
 * @throws IOException if parseFile() fails to read the file structure
 */
public PDFFile(ByteBuffer buf) throws IOException {
    // the page cache is independent of the buffer, so create it first
    this.cache = new Cache();
    this.buf = buf;

    parseFile();
}
0094:
0095: /**
0096: * Gets whether the owner of the file has given permission to print
0097: * the file.
0098: * @return true if it is okay to print the file
0099: */
0100: public boolean isPrintable() {
0101: return printable;
0102: }
0103:
0104: /**
0105: * Gets whether the owner of the file has given permission to save
0106: * a copy of the file.
0107: * @return true if it is okay to save the file
0108: */
0109: public boolean isSaveable() {
0110: return saveable;
0111: }
0112:
0113: /**
0114: * get the root PDFObject of this PDFFile. You generally shouldn't need
0115: * this, but we've left it open in case you want to go spelunking.
0116: */
0117: public PDFObject getRoot() {
0118: return root;
0119: }
0120:
0121: /**
0122: * return the number of pages in this PDFFile. The pages will be
0123: * numbered from 1 to getNumPages(), inclusive.
0124: */
0125: public int getNumPages() {
0126: try {
0127: return root.getDictRef("Pages").getDictRef("Count")
0128: .getIntValue();
0129: } catch (IOException ioe) {
0130: return 0;
0131: }
0132: }
0133:
0134: /**
0135: * Used internally to track down PDFObject references. You should never
0136: * need to call this.
0137: * <p>
0138: * Since this is the only public method for tracking down PDF objects,
0139: * it is synchronized. This means that the PDFFile can only hunt down
0140: * one object at a time, preventing the file's location from getting
0141: * messed around.
0142: * <p>
0143: * This call stores the current buffer position before any changes are made
0144: * and restores it afterwards, so callers need not know that the position
0145: * has changed.
0146: *
0147: */
0148: public synchronized PDFObject dereference(PDFXref ref)
0149: throws IOException {
0150: int id = ref.getID();
0151:
0152: // make sure the id is valid and has been read
0153: if (id >= objIdx.length || objIdx[id] == null) {
0154: return PDFObject.nullObj;
0155: }
0156:
0157: // check to see if this is already dereferenced
0158: PDFObject obj = objIdx[id].getObject();
0159: if (obj != null) {
0160: return obj;
0161: }
0162:
0163: int loc = objIdx[id].getFilePos();
0164: if (loc < 0) {
0165: return PDFObject.nullObj;
0166: }
0167:
0168: // store the current position in the buffer
0169: int startPos = buf.position();
0170:
0171: // move to where this object is
0172: buf.position(loc);
0173:
0174: // read the object and cache the reference
0175: obj = readObject();
0176: if (obj == null) {
0177: obj = PDFObject.nullObj;
0178: }
0179:
0180: objIdx[id].setObject(obj);
0181:
0182: // reset to the previous position
0183: buf.position(startPos);
0184:
0185: return obj;
0186: }
0187:
0188: /**
0189: * Is the argument a white space character according to the PDF spec?
0190: */
0191: public static boolean isWhiteSpace(int c) {
0192: return (c == ' ' || c == '\t' || c == '\r' || c == '\n'
0193: || c == 0 || c == 12);
0194: // 0=nul, 12=ff
0195: }
0196:
0197: /**
0198: * Is the argument a delimiter according to the PDF spec?
0199: */
0200: public static boolean isDelimiter(int c) {
0201: return (c == '(' || c == ')' || c == '{' || c == '}'
0202: || c == '[' || c == ']' || c == '/' || c == '<'
0203: || c == '>' || c == '%' || isWhiteSpace(c));
0204: }
0205:
0206: /**
0207: * read the next object from the file
0208: */
0209: private PDFObject readObject() throws IOException {
0210: return readObject(false);
0211: }
0212:
0213: /**
0214: * read the next object with a special catch for numbers
0215: * @param numscan if true, don't bother trying to see if a number
0216: * is part of a "241 43 R" type of object reference.
0217: * @return the next PDFObject in the file
0218: */
0219: private PDFObject readObject(boolean numscan) throws IOException {
0220: // skip whitespace
0221: int c;
0222: PDFObject obj = null;
0223: while (obj == null) {
0224: while (isWhiteSpace(c = buf.get())) {
0225: }
0226: // check character for special punctuation:
0227: if (c == '<') {
0228: // could be start of <hex data>, or start of <<dictionary>>
0229: c = buf.get();
0230: if (c == '<') {
0231: // it's a dictionary
0232: obj = readDictionary();
0233: } else {
0234: buf.position(buf.position() - 1);
0235: obj = readHexString();
0236: }
0237: } else if (c == '(') {
0238: // it's a string
0239: obj = readString();
0240: } else if (c == '[') {
0241: // it's an array
0242: obj = readArray();
0243: } else if (c == '/') {
0244: // it's a name
0245: obj = readName();
0246: } else if (c == '%') {
0247: // it's a comment
0248: readLine();
0249: } else if ((c >= '0' && c <= '9') || c == '-' || c == '+'
0250: || c == '.') {
0251: // it's a number
0252: obj = readNumber((char) c);
0253: if (!numscan) {
0254: // It could be the start of a reference.
0255: // Check to see if there's another number, then "R".
0256: //
0257: // We can't use mark/reset, since this could be called
0258: // from dereference, which already is using a mark
0259: int startPos = buf.position();
0260:
0261: PDFObject testnum = readObject(true);
0262: if (testnum != null
0263: && testnum.getType() == PDFObject.NUMBER) {
0264: PDFObject testR = readObject(true);
0265: if (testR != null
0266: && testR.getType() == PDFObject.KEYWORD
0267: && testR.getStringValue().equals("R")) {
0268: // yup. it's a reference.
0269: PDFXref xref = new PDFXref(obj
0270: .getIntValue(), testnum
0271: .getIntValue());
0272: // Create a placeholder that will be dereferenced
0273: // as needed
0274: obj = new PDFObject(this , xref);
0275: } else if (testR != null
0276: && testR.getType() == PDFObject.KEYWORD
0277: && testR.getStringValue().equals("obj")) {
0278: // it's an object description
0279: obj = readObjectDescription();
0280: } else {
0281: buf.position(startPos);
0282: }
0283: } else {
0284: buf.position(startPos);
0285: }
0286: }
0287: } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
0288: // it's a keyword
0289: obj = readKeyword((char) c);
0290: } else {
0291: // it's probably a closing character.
0292: // throwback
0293: buf.position(buf.position() - 1);
0294: break;
0295: }
0296: }
0297: return obj;
0298: }
0299:
0300: /**
0301: * requires the next few characters (after whitespace) to match the
0302: * argument.
0303: * @param match the next few characters after any whitespace that
0304: * must be in the file
0305: * @return true if the next characters match; false otherwise.
0306: */
0307: private boolean nextItemIs(String match) throws IOException {
0308: // skip whitespace
0309: int c;
0310: while (isWhiteSpace(c = buf.get())) {
0311: }
0312: for (int i = 0; i < match.length(); i++) {
0313: if (i > 0) {
0314: c = buf.get();
0315: }
0316: if (c != match.charAt(i)) {
0317: return false;
0318: }
0319: }
0320: return true;
0321: }
0322:
0323: /**
0324: * read an entire << dictionary >>. The initial
0325: * << has already been read.
0326: * @return the Dictionary as a PDFObject.
0327: */
0328: private PDFObject readDictionary() throws IOException {
0329: HashMap hm = new HashMap();
0330: // we've already read the <<. Now get /Name obj pairs until >>
0331: PDFObject name;
0332: while ((name = readObject()) != null) {
0333: // make sure first item is a NAME
0334: if (name.getType() != PDFObject.NAME) {
0335: throw new PDFParseException(
0336: "First item in dictionary must be a /Name. (Was "
0337: + name + ")");
0338: }
0339: PDFObject value = readObject();
0340: if (value != null) {
0341: hm.put(name.getStringValue(), value);
0342: }
0343: }
0344: // System.out.println("End of dictionary at location "+raf.getFilePointer());
0345: if (!nextItemIs(">>")) {
0346: throw new PDFParseException("End of dictionary wasn't '>>'");
0347: }
0348: // System.out.println("Dictionary closed at location "+raf.getFilePointer());
0349: return new PDFObject(this , PDFObject.DICTIONARY, hm);
0350: }
0351:
0352: /**
0353: * read a character, and return its value as if it were a hexidecimal
0354: * digit.
0355: * @return a number between 0 and 15 whose value matches the next
0356: * hexidecimal character. Returns -1 if the next character isn't in
0357: * [0-9a-fA-F]
0358: */
0359: private int readHexDigit() throws IOException {
0360: int a;
0361: while (isWhiteSpace(a = buf.get())) {
0362: }
0363: if (a >= '0' && a <= '9') {
0364: a -= '0';
0365: } else if (a >= 'a' && a <= 'f') {
0366: a -= 'a' - 10;
0367: } else if (a >= 'A' && a <= 'F') {
0368: a -= 'A' - 10;
0369: } else {
0370: a = -1;
0371: }
0372: return a;
0373: }
0374:
0375: /**
0376: * return the 8-bit value represented by the next two hex characters.
0377: * If the next two characters don't represent a hex value, return -1
0378: * and reset the read head. If there is only one hex character,
0379: * return its value as if there were an implicit 0 after it.
0380: */
0381: private int readHexPair() throws IOException {
0382: int first = readHexDigit();
0383: if (first < 0) {
0384: buf.position(buf.position() - 1);
0385: return -1;
0386: }
0387: int second = readHexDigit();
0388: if (second < 0) {
0389: buf.position(buf.position() - 1);
0390: return (first << 4);
0391: } else {
0392: return (first << 4) + second;
0393: }
0394: }
0395:
0396: /**
0397: * read a < hex string >. The initial < has already been read.
0398: */
0399: private PDFObject readHexString() throws IOException {
0400: // we've already read the <. Now get the hex bytes until >
0401: int val;
0402: StringBuffer sb = new StringBuffer();
0403: while ((val = readHexPair()) >= 0) {
0404: sb.append((char) val);
0405: }
0406: if (buf.get() != '>') {
0407: throw new PDFParseException("Bad character in Hex String");
0408: }
0409: return new PDFObject(this , PDFObject.STRING, unicode(sb
0410: .toString()));
0411: }
0412:
0413: /**
0414: * take a string and determine if it is unicode by looking at the lead
0415: * characters, and that the string must be a multiple of 2 chars long.
0416: * Convert a unicoded string's characters into the true unicode.
0417: *
0418: * @param input
0419: * @return
0420: */
0421: private String unicode(String input) {
0422: // determine if we have unicode, if so, translate it
0423: if (input.length() < 2 || (input.length() % 2) != 0) {
0424: return input;
0425: }
0426: int c0 = input.charAt(0) & 0xFF;
0427: int c1 = input.charAt(1) & 0xFF;
0428: if ((c0 == 0xFE && c1 == 0xFF) || (c0 == 0xFF && c1 == 0xFE)) {
0429: // we have unicode
0430: boolean bigEndian = (input.charAt(1) == 0xFFFF);
0431: StringBuffer out = new StringBuffer();
0432: for (int i = 2; i < input.length(); i += 2) {
0433: if (bigEndian) {
0434: out
0435: .append((char) (((input.charAt(i + 1) & 0xFF) << 8) + (input
0436: .charAt(i) & 0xFF)));
0437: } else {
0438: out
0439: .append((char) (((input.charAt(i) & 0xFF) << 8) + (input
0440: .charAt(i + 1) & 0xFF)));
0441: }
0442: }
0443: return out.toString();
0444: } else {
0445: return input;
0446: }
0447: }
0448:
0449: /**
0450: * <p>read a ( character string ). The initial ( has already been read.
0451: * Read until a *balanced* ) appears.</p>
0452: *
0453: * <p>PDF Reference Section 3.8.1, Table 3.31 "PDF Data Types" defines
0454: * String data as:<pre>
0455: * "text string Bytes that represent characters encoded
0456: * using either PDFDocEncoding or UTF-16BE with a
0457: * leading byte-order marker (as defined in
0458: * “Text String Type� on page 158.)
0459: * </pre></p>
0460: *
0461: * <p>Section 5.3.2 defines character sequences and escapes.<br>
0462: * "The strings must conform to the syntax for string objects.
0463: * When a string is written by enclosing the data in parentheses,
0464: * bytes whose values are the same as those of the ASCII characters
0465: * left parenthesis (40), right parenthesis (41), and backslash (92)
0466: * must be preceded by a backslash character. All other byte values
0467: * between 0 and 255 may be used in a string object. <br>
0468: * These rules apply to each individual byte in a string object,
0469: * whether the string is interpreted by the text-showing operators
0470: * as single-byte or multiple-byte character codes."</p>
0471: */
0472: private PDFObject readString() throws IOException {
0473: int c;
0474:
0475: // we've already read the (. now get the characters until a
0476: // *balanced* ) appears. Translate \r \n \t \b \f \( \) \\ \ddd
0477: // if a cr/lf follows a backslash, ignore the cr/lf
0478: int parencount = 1;
0479: StringBuffer sb = new StringBuffer();
0480:
0481: while (parencount > 0) {
0482: c = buf.get() & 0xFF;
0483: // process unescaped parenthesis
0484: if (c == '(') {
0485: parencount++;
0486: } else if (c == ')') {
0487: parencount--;
0488: if (parencount == 0) {
0489: c = -1;
0490: break;
0491: }
0492: } else if (c == '\\') {
0493: // time to do some work
0494: c = buf.get() & 0xFF;
0495: if (c == 'r') {
0496: c = '\r';
0497: } else if (c == 'n') {
0498: c = '\n';
0499: } else if (c == 't') {
0500: c = '\t';
0501: } else if (c == 'b') {
0502: c = '\b';
0503: } else if (c == 'f') {
0504: c = '\f';
0505: }
0506: if (c == '\r') {
0507: // check for following \n
0508: c = buf.get() & 0xFF;
0509: if (c != '\n') {
0510: buf.position(buf.position() - 1);
0511: }
0512: c = -1;
0513: } else if (c == '\n') {
0514: c = -1;
0515: } else if (c >= '0' && c <= '9') {
0516: int count = 0;
0517: int val = 0;
0518: while (c >= '0' && c <= '9' && count < 3) {
0519: val = val * 8 + c - '0';
0520: c = buf.get() & 0xFF;
0521: count++;
0522: }
0523: buf.position(buf.position() - 1);
0524: c = val;
0525: }
0526: }
0527: if (c >= 0) {
0528: sb.append((char) c);
0529: }
0530: }
0531: return new PDFObject(this , PDFObject.STRING, unicode(sb
0532: .toString()));
0533: }
0534:
0535: /**
0536: * Read a line of text. This follows the semantics of readLine() in
0537: * DataInput -- it reads character by character until a '/n' is
0538: * encountered. If a '/r' is encountered, it is discarded.
0539: */
0540: private String readLine() {
0541: StringBuffer sb = new StringBuffer();
0542:
0543: while (buf.remaining() > 0) {
0544: char c = (char) buf.get();
0545:
0546: if (c == '\r') {
0547: if (buf.remaining() > 0) {
0548: char n = (char) buf.get(buf.position());
0549: if (n == '\n') {
0550: buf.get();
0551: }
0552: }
0553: break;
0554: } else if (c == '\n') {
0555: break;
0556: }
0557:
0558: sb.append(c);
0559: }
0560:
0561: return sb.toString();
0562: }
0563:
0564: /**
0565: * read an [ array ]. The initial [ has already been read. PDFObjects
0566: * are read until ].
0567: */
0568: private PDFObject readArray() throws IOException {
0569: // we've already read the [. Now read objects until ]
0570: ArrayList ary = new ArrayList();
0571: PDFObject obj;
0572: while ((obj = readObject()) != null) {
0573: ary.add(obj);
0574: }
0575: if (buf.get() != ']') {
0576: throw new PDFParseException("Array should end with ']'");
0577: }
0578: PDFObject[] objlist = new PDFObject[ary.size()];
0579: for (int i = 0; i < objlist.length; i++) {
0580: objlist[i] = (PDFObject) ary.get(i);
0581: }
0582: return new PDFObject(this , PDFObject.ARRAY, objlist);
0583: }
0584:
0585: /**
0586: * read a /name. The / has already been read.
0587: */
0588: private PDFObject readName() throws IOException {
0589: // we've already read the / that begins the name.
0590: // all we have to check for is #hh hex notations.
0591: StringBuffer sb = new StringBuffer();
0592: int c;
0593: while (!isDelimiter(c = buf.get())) {
0594: if (c == '#') {
0595: int hex = readHexPair();
0596: if (hex >= 0) {
0597: c = hex;
0598: } else {
0599: throw new PDFParseException("Bad #hex in /Name");
0600: }
0601: }
0602: sb.append((char) c);
0603: }
0604: buf.position(buf.position() - 1);
0605: return new PDFObject(this , PDFObject.NAME, sb.toString());
0606: }
0607:
0608: /**
0609: * read a number. The initial digit or . or - is passed in as the
0610: * argument.
0611: */
0612: private PDFObject readNumber(char start) throws IOException {
0613: // we've read the first digit (it's passed in as the argument)
0614: boolean neg = start == '-';
0615: boolean sawdot = start == '.';
0616: double dotmult = sawdot ? 0.1 : 1;
0617: double value = (start >= '0' && start <= '9') ? start - '0' : 0;
0618: while (true) {
0619: int c = buf.get();
0620: if (c == '.') {
0621: if (sawdot) {
0622: throw new PDFParseException(
0623: "Can't have two '.' in a number");
0624: }
0625: sawdot = true;
0626: dotmult = 0.1;
0627: } else if (c >= '0' && c <= '9') {
0628: int val = c - '0';
0629: if (sawdot) {
0630: value += val * dotmult;
0631: dotmult *= 0.1;
0632: } else {
0633: value = value * 10 + val;
0634: }
0635: } else {
0636: buf.position(buf.position() - 1);
0637: break;
0638: }
0639: }
0640: if (neg) {
0641: value = -value;
0642: }
0643: return new PDFObject(this , PDFObject.NUMBER, new Double(value));
0644: }
0645:
0646: /**
0647: * read a bare keyword. The initial character is passed in as the
0648: * argument.
0649: */
0650: private PDFObject readKeyword(char start) throws IOException {
0651: // we've read the first character (it's passed in as the argument)
0652: StringBuffer sb = new StringBuffer(String.valueOf(start));
0653: int c;
0654: while (!isDelimiter(c = buf.get())) {
0655: sb.append((char) c);
0656: }
0657: buf.position(buf.position() - 1);
0658: return new PDFObject(this , PDFObject.KEYWORD, sb.toString());
0659: }
0660:
0661: /**
0662: * read an entire PDFObject. The intro line, which looks something
0663: * like "4 0 obj" has already been read.
0664: */
0665: private PDFObject readObjectDescription() throws IOException {
0666: // we've already read the 4 0 obj bit. Next thing up is the object.
0667: // object descriptions end with the keyword endobj
0668: long debugpos = buf.position();
0669: PDFObject obj = readObject();
0670: // see if it's a dictionary. If so, this could be a stream.
0671: PDFObject endkey = readObject();
0672: if (endkey.getType() != PDFObject.KEYWORD) {
0673: throw new PDFParseException("Expected 'stream' or 'endobj'");
0674: }
0675: if (obj.getType() == PDFObject.DICTIONARY
0676: && endkey.getStringValue().equals("stream")) {
0677: // skip until we see \n
0678: readLine();
0679: ByteBuffer data = readStream(obj);
0680: if (data == null) {
0681: data = ByteBuffer.allocate(0);
0682: }
0683: obj.setStream(data);
0684: endkey = readObject();
0685: // if (endkey.getType()!=PDFObject.KEYWORD) {
0686: // System.out.println("WARNING! Object at "+debugpos+" didn't end with 'endobj'");
0687: //throw new PDFParseException("Object must end with 'endobj'");
0688: // }
0689: }
0690: // at this point, obj is the object, keyword should be "endobj"
0691: String endcheck = endkey.getStringValue();
0692: if (endcheck == null || !endcheck.equals("endobj")) {
0693: System.out.println("WARNING: object at " + debugpos
0694: + " didn't end with 'endobj'");
0695: //throw new PDFParseException("Object musst end with 'endobj'");
0696: }
0697: return obj;
0698: }
0699:
0700: /**
0701: * read the stream portion of a PDFObject. Calls decodeStream to
0702: * un-filter the stream as necessary.
0703: *
0704: * @param dict the dictionary associated with this stream.
0705: * @return a ByteBuffer with the encoded stream data
0706: */
0707: private ByteBuffer readStream(PDFObject dict) throws IOException {
0708: // pointer is at the start of a stream. read the stream and
0709: // decode, based on the entries in the dictionary
0710: PDFObject lengthObj = dict.getDictRef("Length");
0711: int length = -1;
0712: if (lengthObj != null) {
0713: length = lengthObj.getIntValue();
0714: }
0715: if (length < 0) {
0716: throw new PDFParseException("Unknown length for stream");
0717: }
0718:
0719: // slice the data
0720: int start = buf.position();
0721: ByteBuffer streamBuf = buf.slice();
0722: streamBuf.limit(length);
0723:
0724: // move the current position to the end of the data
0725: buf.position(buf.position() + length);
0726: int ending = buf.position();
0727:
0728: if (!nextItemIs("endstream")) {
0729: System.out.println("read " + length + " chars from "
0730: + start + " to " + ending);
0731: throw new PDFParseException("Stream ended inappropriately");
0732: }
0733:
0734: return streamBuf;
0735: // now decode stream
0736: // return PDFDecoder.decodeStream(dict, streamBuf);
0737: }
0738:
0739: /**
0740: * read the cross reference table from a PDF file. When this method
0741: * is called, the file pointer must point to the start of the word
0742: * "xref" in the file. Reads the xref table and the trailer dictionary.
0743: * If dictionary has a /Prev entry, move file pointer
0744: * and read new trailer
0745: */
0746: private void readTrailer() throws IOException {
0747: // the table of xrefs
0748: objIdx = new PDFXref[50];
0749:
0750: // read a bunch of nester trailer tables
0751: while (true) {
0752: // make sure we are looking at an xref table
0753: if (!nextItemIs("xref")) {
0754: throw new PDFParseException(
0755: "Expected 'xref' at start of table");
0756: }
0757:
0758: // read a bunch of linked tabled
0759: while (true) {
0760: // read until the word "trailer"
0761: PDFObject obj = readObject();
0762: if (obj.getType() == PDFObject.KEYWORD
0763: && obj.getStringValue().equals("trailer")) {
0764: break;
0765: }
0766:
0767: // read the starting position of the reference
0768: if (obj.getType() != PDFObject.NUMBER) {
0769: throw new PDFParseException(
0770: "Expected number for first xref entry");
0771: }
0772: int refstart = obj.getIntValue();
0773:
0774: // read the size of the reference table
0775: obj = readObject();
0776: if (obj.getType() != PDFObject.NUMBER) {
0777: throw new PDFParseException(
0778: "Expected number for length of xref table");
0779: }
0780: int reflen = obj.getIntValue();
0781:
0782: // skip a line
0783: readLine();
0784:
0785: // extend the objIdx table, if necessary
0786: if (refstart + reflen >= objIdx.length) {
0787: PDFXref nobjIdx[] = new PDFXref[refstart + reflen];
0788: System.arraycopy(objIdx, 0, nobjIdx, 0,
0789: objIdx.length);
0790: objIdx = nobjIdx;
0791: }
0792:
0793: // read reference lines
0794: for (int refID = refstart; refID < refstart + reflen; refID++) {
0795: // each reference line is 20 bytes long
0796: byte[] refline = new byte[20];
0797: buf.get(refline);
0798:
0799: // ignore this line if the object ID is already defined
0800: if (objIdx[refID] != null) {
0801: continue;
0802: }
0803:
0804: // see if it's an active object
0805: if (refline[17] == 'n') {
0806: objIdx[refID] = new PDFXref(refline);
0807: } else {
0808: objIdx[refID] = new PDFXref(null);
0809: }
0810: }
0811: }
0812:
0813: // at this point, the "trailer" word (not EOL) has been read.
0814: PDFObject trailerdict = readObject();
0815: if (trailerdict.getType() != PDFObject.DICTIONARY) {
0816: throw new IOException(
0817: "Expected dictionary after \"trailer\"");
0818: }
0819:
0820: // read the root object location
0821: if (root == null) {
0822: root = trailerdict.getDictRef("Root");
0823: }
0824:
0825: // read the encryption information
0826: if (encrypt == null) {
0827: encrypt = trailerdict.getDictRef("Encrypt");
0828: }
0829:
0830: // read the location of the previous xref table
0831: PDFObject prevloc = trailerdict.getDictRef("Prev");
0832: if (prevloc != null) {
0833: buf.position(prevloc.getIntValue());
0834: } else {
0835: break;
0836: }
0837: }
0838:
0839: // make sure we found a root
0840: if (root == null) {
0841: throw new PDFParseException(
0842: "No /Root key found in trailer dictionary");
0843: }
0844:
0845: // check what permissions are relevant
0846: if (encrypt != null) {
0847: PDFObject permissions = encrypt.getDictRef("P");
0848: if (permissions != null) {
0849: int perms = permissions.getIntValue();
0850: if ((perms & 4) == 0) {
0851: printable = false;
0852: }
0853: if ((perms & 16) == 0) {
0854: saveable = false;
0855: }
0856: }
0857: }
0858:
0859: // dereference the root object
0860: root.dereference();
0861: }
0862:
0863: /**
0864: * build the PDFFile reference table. Nothing in the PDFFile actually
0865: * gets parsed, despite the name of this function. Things only get
0866: * read and parsed when they're needed.
0867: */
0868: private void parseFile() throws IOException {
0869: // start at the begining of the file
0870: buf.rewind();
0871:
0872: // back up about 32 characters from the end of the file to find
0873: // startxref\n
0874: byte[] scan = new byte[32];
0875: int scanPos = buf.remaining() - scan.length;
0876: int loc = 0;
0877:
0878: while (scanPos >= 0) {
0879: buf.position(scanPos);
0880: buf.get(scan);
0881:
0882: // find startxref in scan
0883: String scans = new String(scan);
0884: loc = scans.indexOf("startxref");
0885: if (loc > 0) {
0886: if (scanPos + loc + scan.length <= buf.limit()) {
0887: scanPos = scanPos + loc;
0888: loc = 0;
0889: }
0890:
0891: break;
0892: }
0893: scanPos -= scan.length - 10;
0894: }
0895:
0896: if (scanPos < 0) {
0897: throw new IOException("This may not be a PDF File");
0898: }
0899:
0900: buf.position(scanPos);
0901: buf.get(scan);
0902: String scans = new String(scan);
0903:
0904: loc += 10; // skip over "startxref" and first EOL char
0905: if (scans.charAt(loc) < 32) {
0906: loc++;
0907: } // skip over possible 2nd EOL char
0908: // read number
0909: int numstart = loc;
0910: while (loc < scans.length() && scans.charAt(loc) >= '0'
0911: && scans.charAt(loc) <= '9') {
0912: loc++;
0913: }
0914: int xrefpos = Integer.parseInt(scans.substring(numstart, loc));
0915: buf.position(xrefpos);
0916:
0917: readTrailer();
0918: }
0919:
0920: /**
0921: * Gets the outline tree as a tree of OutlineNode, which is a subclass
0922: * of DefaultMutableTreeNode. If there is no outline tree, this method
0923: * returns null.
0924: */
0925: public OutlineNode getOutline() throws IOException {
0926: // find the outlines entry in the root object
0927: PDFObject oroot = root.getDictRef("Outlines");
0928: OutlineNode work = null;
0929: OutlineNode outline = null;
0930: if (oroot != null) {
0931: // find the first child of the outline root
0932: PDFObject scan = oroot.getDictRef("First");
0933: outline = work = new OutlineNode("<top>");
0934:
0935: // scan each sibling in turn
0936: while (scan != null) {
0937: // add the new node with it's name
0938: String title = scan.getDictRef("Title")
0939: .getStringValue();
0940: OutlineNode build = new OutlineNode(title);
0941: work.add(build);
0942:
0943: // find the action
0944: PDFAction action = null;
0945:
0946: PDFObject actionObj = scan.getDictRef("A");
0947: if (actionObj != null) {
0948: action = PDFAction.getAction(actionObj, getRoot());
0949: } else {
0950: // try to create an action from a destination
0951: PDFObject destObj = scan.getDictRef("Dest");
0952: if (destObj != null) {
0953: try {
0954: PDFDestination dest = PDFDestination
0955: .getDestination(destObj, getRoot());
0956:
0957: action = new GoToAction(dest);
0958: } catch (IOException ioe) {
0959: // oh well
0960: }
0961: }
0962: }
0963:
0964: // did we find an action? If so, add it
0965: if (action != null) {
0966: build.setAction(action);
0967: }
0968:
0969: // find the first child of this node
0970: PDFObject kid = scan.getDictRef("First");
0971: if (kid != null) {
0972: work = build;
0973: scan = kid;
0974: } else {
0975: // no child. Process the next sibling
0976: PDFObject next = scan.getDictRef("Next");
0977: while (next == null) {
0978: scan = scan.getDictRef("Parent");
0979: next = scan.getDictRef("Next");
0980: work = (OutlineNode) work.getParent();
0981: if (work == null) {
0982: break;
0983: }
0984: }
0985: scan = next;
0986: }
0987: }
0988: }
0989:
0990: return outline;
0991: }
0992:
0993: /**
0994: * Gets the page number (starting from 1) of the page represented by
0995: * a particular PDFObject. The PDFObject must be a Page dictionary or
0996: * a destination description (or an action).
0997: * @return a number between 1 and the number of pages indicating the
0998: * page number, or 0 if the PDFObject is not in the page tree.
0999: */
1000: public int getPageNumber(PDFObject page) throws IOException {
1001: if (page.getType() == PDFObject.ARRAY) {
1002: page = page.getAt(0);
1003: }
1004:
1005: // now we've got a page. Make sure.
1006: PDFObject typeObj = page.getDictRef("Type");
1007: if (typeObj == null || !typeObj.getStringValue().equals("Page")) {
1008: return 0;
1009: }
1010:
1011: int count = 0;
1012: while (true) {
1013: PDFObject parent = page.getDictRef("Parent");
1014: if (parent == null) {
1015: break;
1016: }
1017: PDFObject kids[] = parent.getDictRef("Kids").getArray();
1018: for (int i = 0; i < kids.length; i++) {
1019: if (kids[i].equals(page)) {
1020: break;
1021: } else {
1022: PDFObject kcount = kids[i].getDictRef("Count");
1023: if (kcount != null) {
1024: count += kcount.getIntValue();
1025: } else {
1026: count += 1;
1027: }
1028: }
1029: }
1030: page = parent;
1031: }
1032: return count;
1033: }
1034:
1035: /**
1036: * Get the page commands for a given page in a separate thread.
1037: *
1038: * @param pagenum the number of the page to get commands for
1039: */
1040: public PDFPage getPage(int pagenum) {
1041: return getPage(pagenum, false);
1042: }
1043:
1044: /**
1045: * Get the page commands for a given page.
1046: *
1047: * @param pagenum the number of the page to get commands for
1048: * @param wait if true, do not exit until the page is complete.
1049: */
1050: public PDFPage getPage(int pagenum, boolean wait) {
1051: Integer key = new Integer(pagenum);
1052: HashMap resources = null;
1053: PDFObject pageObj = null;
1054: boolean needread = false;
1055:
1056: PDFPage page = cache.getPage(key);
1057: PDFParser parser = cache.getPageParser(key);
1058: if (page == null) {
1059: try {
1060: // hunt down the page!
1061: resources = new HashMap();
1062:
1063: PDFObject topPagesObj = root.getDictRef("Pages");
1064: pageObj = findPage(topPagesObj, 0, pagenum, resources);
1065:
1066: if (pageObj == null) {
1067: return null;
1068: }
1069:
1070: page = createPage(pagenum, pageObj);
1071:
1072: byte[] stream = getContents(pageObj);
1073: parser = new PDFParser(page, stream, resources);
1074:
1075: cache.addPage(key, page, parser);
1076: } catch (IOException ioe) {
1077: System.out.println("GetPage inner loop:");
1078: ioe.printStackTrace();
1079: return null;
1080: }
1081: }
1082:
1083: if (parser != null && !parser.isFinished()) {
1084: parser.go(wait);
1085: }
1086:
1087: return page;
1088: }
1089:
1090: /**
1091: * Stop the rendering of a particular image on this page
1092: */
1093: public void stop(int pageNum) {
1094: PDFParser parser = cache.getPageParser(new Integer(pageNum));
1095: if (parser != null) {
1096: // stop it
1097: parser.stop();
1098: }
1099: }
1100:
1101: /**
1102: * get the stream representing the content of a particular page.
1103: *
1104: * @param pageObj the page object to get the contents of
1105: * @return a concatenation of any content streams for the requested
1106: * page.
1107: */
1108: private byte[] getContents(PDFObject pageObj) throws IOException {
1109: // concatenate all the streams
1110: PDFObject contentsObj = pageObj.getDictRef("Contents");
1111: if (contentsObj == null) {
1112: throw new IOException("No page contents!");
1113: }
1114:
1115: PDFObject contents[] = contentsObj.getArray();
1116:
1117: // see if we have only one stream (the easy case)
1118: if (contents.length == 1) {
1119: return contents[0].getStream();
1120: }
1121:
1122: // first get the total length of all the streams
1123: int len = 0;
1124: for (int i = 0; i < contents.length; i++) {
1125: byte[] data = contents[i].getStream();
1126: if (data == null) {
1127: throw new PDFParseException("No stream on content " + i
1128: + ": " + contents[i]);
1129: }
1130: len += data.length;
1131: }
1132:
1133: // now assemble them all into one object
1134: byte[] stream = new byte[len];
1135: len = 0;
1136: for (int i = 0; i < contents.length; i++) {
1137: byte data[] = contents[i].getStream();
1138: System.arraycopy(data, 0, stream, len, data.length);
1139: len += data.length;
1140: }
1141:
1142: return stream;
1143: }
1144:
1145: /**
1146: * Create a PDF Page object by finding the relevant inherited
1147: * properties
1148: *
1149: * @param pageObj the PDF object for the page to be created
1150: */
1151: private PDFPage createPage(int pagenum, PDFObject pageObj)
1152: throws IOException {
1153: int rotation = 0;
1154: Rectangle2D mediabox = null; // second choice, if no crop
1155: Rectangle2D cropbox = null; // first choice
1156:
1157: PDFObject mediaboxObj = getInheritedValue(pageObj, "MediaBox");
1158: if (mediaboxObj != null) {
1159: mediabox = parseRect(mediaboxObj);
1160: }
1161:
1162: PDFObject cropboxObj = getInheritedValue(pageObj, "CropBox");
1163: if (cropboxObj != null) {
1164: cropbox = parseRect(cropboxObj);
1165: }
1166:
1167: PDFObject rotateObj = getInheritedValue(pageObj, "Rotate");
1168: if (rotateObj != null) {
1169: rotation = rotateObj.getIntValue();
1170: }
1171:
1172: Rectangle2D bbox = ((cropbox == null) ? mediabox : cropbox);
1173:
1174: return new PDFPage(pagenum, bbox, rotation, cache);
1175: }
1176:
1177: /**
1178: * Get the PDFObject representing the content of a particular page. Note
1179: * that the number of the page need not have anything to do with the
1180: * label on that page. If there are two blank pages, and then roman
1181: * numerals for the page number, then passing in 6 will get page (iv).
1182: *
1183: * @param pagedict the top of the pages tree
1184: * @param start the page number of the first page in this dictionary
1185: * @param getPage the number of the page to find; NOT the page's label.
1186: * @param resources a HashMap that will be filled with any resource
1187: * definitions encountered on the search for the page
1188: */
1189: private PDFObject findPage(PDFObject pagedict, int start,
1190: int getPage, Map resources) throws IOException {
1191: PDFObject rsrcObj = pagedict.getDictRef("Resources");
1192: if (rsrcObj != null) {
1193: resources.putAll(rsrcObj.getDictionary());
1194: }
1195:
1196: PDFObject typeObj = pagedict.getDictRef("Type");
1197: if (typeObj != null && typeObj.getStringValue().equals("Page")) {
1198: // we found our page!
1199: return pagedict;
1200: }
1201:
1202: // find the first child for which (start + count) > getPage
1203: PDFObject kidsObj = pagedict.getDictRef("Kids");
1204: if (kidsObj != null) {
1205: PDFObject[] kids = kidsObj.getArray();
1206: for (int i = 0; i < kids.length; i++) {
1207: int count = 1;
1208: // BUG: some PDFs (T1Format.pdf) don't have the Type tag.
1209: // use the Count tag to indicate a Pages dictionary instead.
1210: PDFObject countItem = kids[i].getDictRef("Count");
1211: // if (kids[i].getDictRef("Type").getStringValue().equals("Pages")) {
1212: if (countItem != null) {
1213: count = countItem.getIntValue();
1214: }
1215:
1216: if (start + count >= getPage) {
1217: return findPage(kids[i], start, getPage, resources);
1218: }
1219:
1220: start += count;
1221: }
1222: }
1223:
1224: return null;
1225: }
1226:
1227: /**
1228: * Find a property value in a page that may be inherited. If the value
1229: * is not defined in the page itself, follow the page's "parent" links
1230: * until the value is found or the top of the tree is reached.
1231: *
1232: * @param pageObj the object representing the page
1233: * @param propName the name of the property we are looking for
1234: */
1235: private PDFObject getInheritedValue(PDFObject pageObj,
1236: String propName) throws IOException {
1237: // see if we have the property
1238: PDFObject propObj = pageObj.getDictRef(propName);
1239: if (propObj != null) {
1240: return propObj;
1241: }
1242:
1243: // recursively see if any of our parent have it
1244: PDFObject parentObj = pageObj.getDictRef("Parent");
1245: if (parentObj != null) {
1246: return getInheritedValue(parentObj, propName);
1247: }
1248:
1249: // no luck
1250: return null;
1251: }
1252:
1253: /**
1254: * get a Rectangle2D.Float representation for a PDFObject that is an
1255: * array of four Numbers.
1256: * @param obj a PDFObject that represents an Array of exactly four
1257: * Numbers.
1258: */
1259: public Rectangle2D.Float parseRect(PDFObject obj)
1260: throws IOException {
1261: if (obj.getType() == PDFObject.ARRAY) {
1262: PDFObject bounds[] = obj.getArray();
1263: if (bounds.length == 4) {
1264: return new Rectangle2D.Float(bounds[0].getFloatValue(),
1265: bounds[1].getFloatValue(), bounds[2]
1266: .getFloatValue()
1267: - bounds[0].getFloatValue(), bounds[3]
1268: .getFloatValue()
1269: - bounds[1].getFloatValue());
1270: } else {
1271: throw new PDFParseException(
1272: "Rectangle definition didn't have 4 elements");
1273: }
1274: } else {
1275: throw new PDFParseException(
1276: "Rectangle definition not an array");
1277: }
1278: }
1279: }
|