001: package com.etymon.pj;
002:
003: import java.io.*;
004: import java.util.*;
005: import com.etymon.pj.exception.*;
006: import com.etymon.pj.object.*;
007:
008: public class PdfParser {
009:
010: public static void getObjects(Pdf pdf, RandomAccessFile raf)
011: throws IOException, PjException {
012: long[][] xref = getXref(pdf, raf);
013: byte[] data;
014: PjObject object;
015: Hashtable ht = new Hashtable();
016: for (int x = 0; x < xref.length; x++) {
017: if (xref[x][2] == 1) {
018: raf.seek(xref[x][0]);
019: data = readUntil(raf, "endobj");
020: object = PdfParser.parse(pdf, raf, xref, data, 0);
021: pdf._objects.setObjectAt(object, x);
022: }
023: }
024: }
025:
026: private static PjObject getObject(Pdf pdf, RandomAccessFile raf,
027: long[][] xref, int num) throws IOException, PjException {
028: // check if the object has been loaded
029: PjObject obj = pdf._objects.objectAt(num);
030: if (obj != null) {
031: return obj;
032: }
033: // otherwise we have to load it
034: raf.seek(xref[num][0]);
035: byte[] data = readUntil(raf, "endobj");
036: obj = PdfParser.parse(pdf, raf, xref, data, 0);
037: pdf._objects.setObjectAt(obj, num);
038: return obj;
039: }
040:
041: private static long[][] getXref(Pdf pdf, RandomAccessFile raf)
042: throws IOException, PjException {
043: // we assume that the cross-reference table as a whole
044: // (including all "sections") is contiguous in terms
045: // of object numbers; in other words, we assume that
046: // '/Size n' in the trailer dictionary indicates not
047: // only that n is the number of cross reference
048: // entries in the table, but also that (n-1) is the
049: // largest object number in use; this allow us to use
050: // a long[][] for storing the table, because we can
051: // allocate it as long[n][3]. I think this is
052: // implicit in the PDF spec but I couldn't find a
053: // clear statement about it. If it turns out that
054: // this is incorrect, we'll have to change all the
055: // code to use a Vector instead of an array.
056: long lastXref = getStartXref(raf);
057: return getNextXref(pdf, raf, lastXref, null);
058: }
059:
060: private static long[][] getNextXref(Pdf pdf, RandomAccessFile raf,
061: long start, long[][] xref) throws IOException, PjException {
062: raf.seek(start);
063: byte[] xrefData = readUntil(raf, "trailer");
064: byte[] trailerData = readUntil(raf, "startxref");
065: PjDictionary trailer = (PjDictionary) (PdfParser.parse(pdf,
066: raf, xref, trailerData, 0));
067: Hashtable h = trailer.getHashtable();
068: long[][] xr;
069: if (xref == null) {
070: xr = new long[((PjNumber) (h.get(new PjName("Size"))))
071: .getInt()][3];
072: pdf._trailer = h;
073: } else {
074: xr = xref;
075: }
076: // recursively collect previous xref data
077: PjNumber prev = (PjNumber) (h.get(new PjName("Prev")));
078: if (prev != null) {
079: xr = getNextXref(pdf, raf, prev.getLong(), xr);
080: }
081: // now overlay this xref data
082: PdfParser.parseXref(xrefData, xr, 0);
083: return xr;
084: }
085:
086: private static long getStartXref(RandomAccessFile raf)
087: throws IOException, PjException {
088: // locate startxref near the end of the file
089: int scan = 0;
090: for (int retry = PjConst.SCAN_STARTXREF_RETRY; retry > 0; retry--) {
091: scan = scan + PjConst.SCAN_STARTXREF;
092: long fileSize = raf.length();
093: raf.seek(fileSize - scan);
094: byte[] buffer = readUntil(raf, "startxref");
095: // next line should be the startxref value
096: buffer = readUntil(raf, "%%EOF");
097: if (buffer.length != 0) {
098: // now parse the long value from the buffer
099: StringBuffer sb = new StringBuffer();
100: boolean abort = false;
101: int x = 0;
102: while ((abort == false)
103: && (Character.isDigit((char) (buffer[x])))) {
104: sb.append((char) (buffer[x]));
105: x++;
106: if (x >= buffer.length) {
107: abort = true;
108: }
109: }
110: if (abort == false) {
111: return new Long(new String(sb)).longValue();
112: }
113: }
114: }
115: throw new StartxrefFormatException(
116: "Unexpected end of file (startxref).");
117: }
118:
119: public static byte[] readUntil(RandomAccessFile raf, String endstr)
120: throws IOException {
121: StringBuffer sb = new StringBuffer();
122: char c = '\0';
123: String s;
124: char[] compare = new char[endstr.length()];
125: char lastEol = '\0';
126: boolean eof = false;
127: boolean done = false;
128: do {
129: try {
130: c = (char) (raf.readUnsignedByte());
131: switch (lastEol) {
132: case '\0':
133: if ((c == '\r') || (c == '\n')) {
134: if (sb.length() >= endstr.length()) {
135: sb.getChars(sb.length() - endstr.length(),
136: sb.length(), compare, 0);
137: s = new String(compare);
138: if (s.equals(endstr)) {
139: lastEol = c;
140: }
141: }
142: }
143: sb.append(c);
144: break;
145: case '\n':
146: raf.seek(raf.getFilePointer() - 1);
147: done = true;
148: break;
149: case '\r':
150: if (c == '\n') {
151: sb.append(c);
152: } else {
153: raf.seek(raf.getFilePointer() - 1);
154: }
155: done = true;
156: break;
157: }
158: } catch (EOFException e) {
159: eof = true;
160: }
161: } while ((eof == false) && (done == false));
162: int y = sb.length();
163: byte[] buffer = new byte[y];
164: for (int x = 0; x < y; x++) {
165: buffer[x] = (byte) (sb.charAt(x));
166: }
167: return buffer;
168: }
169:
170: // deprecated
171: // RandomAccessFile.readLine() does not seem to work!
172: // this is a replacement, but it also discards the trailing
173: // '\r' and/or '\n'
174: protected static String readLine(RandomAccessFile raf)
175: throws IOException {
176: char c = '\0';
177: StringBuffer sb = new StringBuffer();
178: boolean endOfLine = false;
179: boolean endOfFile = false;
180: boolean startOfNext = false;
181: boolean firstChar = true;
182: do {
183: try {
184: c = (char) (raf.readUnsignedByte());
185: if ((c != '\r') && (c != '\n')) {
186: if (endOfLine) {
187: startOfNext = true;
188: } else {
189: sb.append(c);
190: }
191: } else {
192: endOfLine = true;
193: }
194: firstChar = false;
195: } catch (EOFException e) {
196: endOfFile = true;
197: }
198: } while ((endOfFile == false) && (startOfNext == false));
199: if (startOfNext) {
200: raf.seek(raf.getFilePointer() - 1);
201: }
202: if ((endOfFile) && (firstChar)) {
203: return null;
204: } else {
205: return sb.toString();
206: }
207: }
208:
209: public static void parseXref(byte[] data, long[][] xref, int start)
210: throws XrefFormatException {
211: PdfParserState state = new PdfParserState();
212: state._data = data;
213: state._pos = start;
214: getLine(state); // initial "xref"
215: if (state._token.equals("xref") == false) {
216: throw new XrefFormatException(
217: "Start of xref not found (xref).");
218: }
219: StringTokenizer st;
220: int index, count, x;
221: while (state._pos < state._data.length) {
222: getLine(state);
223: st = new StringTokenizer(state._token);
224: if (state._token.equals("trailer")) {
225: return;
226: }
227: index = Integer.parseInt(st.nextToken());
228: count = Integer.parseInt(st.nextToken());
229: for (x = 0; x < count; x++) {
230: getLine(state);
231: st = new StringTokenizer(state._token);
232: xref[index][0] = new Integer(st.nextToken())
233: .longValue();
234: xref[index][1] = new Integer(st.nextToken())
235: .longValue();
236: if (st.nextToken().equals("n")) {
237: xref[index][2] = 1;
238: } else {
239: xref[index][2] = 0;
240: }
241: index++;
242: }
243: }
244: }
245:
246: public static PjObject parse(Pdf pdf, RandomAccessFile raf,
247: long[][] xref, byte[] data, int start) throws IOException,
248: PjException {
249: PdfParserState state = new PdfParserState();
250: state._data = data;
251: state._pos = start;
252: state._stream = -1;
253: Stack stack = new Stack();
254: boolean endFlag = false;
255: while ((!endFlag) && (getToken(state))) {
256: if (state._stream != -1) {
257: stack.push(state._streamToken);
258: state._stream = -1;
259: } else if (state._token.equals("startxref")) {
260: endFlag = true;
261: } else if (state._token.equals("endobj")) {
262: endFlag = true;
263: } else if (state._token.equals("%%EOF")) {
264: endFlag = true;
265: } else if (state._token.equals("endstream")) {
266: byte[] stream = (byte[]) (stack.pop());
267: PjStreamDictionary pjsd = new PjStreamDictionary(
268: ((PjDictionary) (stack.pop())).getHashtable());
269: PjStream pjs = new PjStream(pjsd, stream);
270: stack.push(pjs);
271: } else if (state._token.equals("stream")) {
272: // get length of stream
273: PjObject obj = ((PjObject) ((((PjDictionary) (stack
274: .peek())).getHashtable().get(new PjName(
275: "Length")))));
276: if (obj instanceof PjReference) {
277: obj = getObject(pdf, raf, xref,
278: ((PjReference) (obj)).getObjNumber()
279: .getInt());
280: }
281: state._stream = ((PjNumber) (obj)).getInt();
282:
283: // the following if() clause added to
284: // handle the case of "Length" being
285: // incorrect (larger than the actual
286: // stream length)
287: if (state._stream > (state._data.length - state._pos)) {
288: state._stream = state._data.length - state._pos
289: - 17;
290: }
291:
292: if (state._pos < state._data.length) {
293: if ((char) (state._data[state._pos]) == '\r') {
294: state._pos++;
295: }
296: if ((state._pos < state._data.length)
297: && ((char) (state._data[state._pos]) == '\n')) {
298: state._pos++;
299: }
300: }
301: } else if (state._token.equals("null")) {
302: stack.push(new PjNull());
303: } else if (state._token.equals("true")) {
304: stack.push(new PjBoolean(true));
305: } else if (state._token.equals("false")) {
306: stack.push(new PjBoolean(false));
307: } else if (state._token.equals("R")) {
308: // we ignore the generation number
309: // because all objects get reset to
310: // generation 0 when we collapse the
311: // incremental updates
312: stack.pop(); // the generation number
313: PjNumber obj = (PjNumber) (stack.pop());
314: stack.push(new PjReference(obj, PjNumber.ZERO));
315: } else if ((state._token.charAt(0) == '<')
316: && (state._token.startsWith("<<") == false)) {
317: stack.push(new PjString(PjString
318: .decodePdf(state._token)));
319: } else if ((Character.isDigit(state._token.charAt(0)))
320: || (state._token.charAt(0) == '-')
321: || (state._token.charAt(0) == '.')) {
322: stack.push(new PjNumber(new Float(state._token)
323: .floatValue()));
324: } else if (state._token.charAt(0) == '(') {
325: stack.push(new PjString(PjString
326: .decodePdf(state._token)));
327: } else if (state._token.charAt(0) == '/') {
328: stack.push(new PjName(state._token.substring(1)));
329: } else if (state._token.equals(">>")) {
330: boolean done = false;
331: Object obj;
332: Hashtable h = new Hashtable();
333: while (!done) {
334: obj = stack.pop();
335: if ((obj instanceof String)
336: && (((String) obj).equals("<<"))) {
337: done = true;
338: } else {
339: h.put((PjName) (stack.pop()), (PjObject) obj);
340: }
341: }
342: // figure out what kind of dictionary we have
343: PjDictionary dictionary = new PjDictionary(h);
344: if (PjPage.isLike(dictionary)) {
345: stack.push(new PjPage(h));
346: } else if (PjPages.isLike(dictionary)) {
347: stack.push(new PjPages(h));
348: } else if (PjFontType1.isLike(dictionary)) {
349: stack.push(new PjFontType1(h));
350: } else if (PjFontDescriptor.isLike(dictionary)) {
351: stack.push(new PjFontDescriptor(h));
352: } else if (PjResources.isLike(dictionary)) {
353: stack.push(new PjResources(h));
354: } else if (PjCatalog.isLike(dictionary)) {
355: stack.push(new PjCatalog(h));
356: } else if (PjInfo.isLike(dictionary)) {
357: stack.push(new PjInfo(h));
358: } else if (PjEncoding.isLike(dictionary)) {
359: stack.push(new PjEncoding(h));
360: } else {
361: stack.push(dictionary);
362: }
363: } else if (state._token.equals("]")) {
364: boolean done = false;
365: Object obj;
366: Vector v = new Vector();
367: while (!done) {
368: obj = stack.pop();
369: if ((obj instanceof String)
370: && (((String) obj).equals("["))) {
371: done = true;
372: } else {
373: v.insertElementAt((PjObject) obj, 0);
374: }
375: }
376: // figure out what kind of array we have
377: PjArray array = new PjArray(v);
378: if (PjRectangle.isLike(array)) {
379: stack.push(new PjRectangle(v));
380: } else if (PjProcSet.isLike(array)) {
381: stack.push(new PjProcSet(v));
382: } else {
383: stack.push(array);
384: }
385: } else if (state._token.startsWith("%")) {
386: // do nothing
387: } else {
388: stack.push(state._token);
389: }
390: }
391: return (PjObject) (stack.pop());
392: }
393:
394: private static boolean getLine(PdfParserState state) {
395: StringBuffer sb = new StringBuffer();
396: char c;
397: while (state._pos < state._data.length) {
398: c = (char) (state._data[state._pos]);
399: state._pos++;
400: switch (c) {
401: case '\r':
402: if ((state._pos < state._data.length)
403: && ((char) (state._data[state._pos]) == '\n')) {
404: state._pos++;
405: }
406: case '\n':
407: state._token = sb.toString();
408: return true;
409: default:
410: sb.append(c);
411: }
412: }
413: return false;
414: }
415:
416: private static boolean getToken(PdfParserState state) {
417: if (state._stream != -1) {
418: state._streamToken = new byte[state._stream];
419: System.arraycopy(state._data, state._pos,
420: state._streamToken, 0, state._stream);
421: state._pos = state._pos + state._stream;
422: return true;
423: }
424: skipWhitespace(state);
425: StringBuffer sb = new StringBuffer();
426: boolean firstChar = true;
427: boolean string = false;
428: int stringParen = 0;
429: boolean hstring = false;
430: char c = '\0';
431: char last;
432: int x;
433: while (state._pos < state._data.length) {
434: last = c;
435: c = (char) (state._data[state._pos]);
436: state._pos++;
437: if (firstChar) {
438: switch (c) {
439: case '(':
440: string = true;
441: stringParen = 0;
442: break;
443: case ']':
444: state._token = "]";
445: return true;
446: case '>':
447: if ((state._pos < state._data.length)
448: && ((char) (state._data[state._pos]) == '>')) {
449: state._pos++;
450: state._token = ">>";
451: return true;
452: }
453: break;
454: case '%':
455: sb.append('%');
456: while ((state._pos < state._data.length)
457: && ((c = (char) (state._data[state._pos])) != '\n')
458: && (c != '\r')) {
459: sb.append(c);
460: state._pos++;
461: }
462: state._token = sb.toString();
463: return true;
464: default:
465: }
466: }
467: if ((string) || (hstring)) {
468: if (string) {
469: if ((c == '(') && (last != '\\')) {
470: stringParen++;
471: }
472: if ((c == ')') && (last != '\\')) {
473: if (stringParen == 1) {
474: sb.append(c);
475: state._token = sb.toString();
476: return true;
477: } else {
478: stringParen--;
479: }
480: }
481: } else {
482: // hex string
483: if (c == '>') {
484: sb.append(c);
485: state._token = sb.toString();
486: return true;
487: }
488: }
489: sb.append(c);
490: } else {
491: if (isWhitespace(c)) {
492: state._token = sb.toString();
493: return true;
494: } else {
495: switch (c) {
496: case '[':
497: if (!firstChar) {
498: state._pos--;
499: state._token = sb.toString();
500: return true;
501: } else {
502: state._token = "[";
503: return true;
504: }
505: case '<':
506: if (!firstChar) {
507: state._pos--;
508: state._token = sb.toString();
509: return true;
510: } else {
511: if ((state._pos < state._data.length)
512: && ((char) (state._data[state._pos]) == '<')) {
513: // dictionary
514: state._pos++;
515: state._token = "<<";
516: return true;
517: } else {
518: // hex string
519: hstring = true;
520: sb.append(c);
521: }
522: }
523: break;
524: case ']':
525: case '/':
526: case '(':
527: if (!firstChar) {
528: state._pos--;
529: state._token = sb.toString();
530: return true;
531: } else {
532: sb.append(c);
533: break;
534: }
535: case '>':
536: if ((state._pos < state._data.length)
537: && ((char) (state._data[state._pos]) == '>')) {
538: state._pos--;
539: state._token = sb.toString();
540: return true;
541: } else {
542: sb.append(c);
543: }
544: break;
545: default:
546: sb.append(c);
547: }
548: }
549: }
550: if (firstChar) {
551: firstChar = false;
552: }
553: }
554: return false;
555: }
556:
557: private static void skipWhitespace(PdfParserState state) {
558: while ((state._pos < state._data.length)
559: && (isWhitespace((char) (state._data[state._pos])))) {
560: state._pos++;
561: }
562: }
563:
564: private static boolean isWhitespace(char c) {
565: switch (c) {
566: case ' ':
567: case '\t':
568: case '\r':
569: case '\n':
570: return true;
571: default:
572: return false;
573: }
574: }
575:
576: }
|