001: /*
002: * Copyright (c) 2007, intarsys consulting GmbH
003: *
004: * Redistribution and use in source and binary forms, with or without
005: * modification, are permitted provided that the following conditions are met:
006: *
007: * - Redistributions of source code must retain the above copyright notice,
008: * this list of conditions and the following disclaimer.
009: *
010: * - Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: *
014: * - Neither the name of intarsys nor the names of its contributors may be used
015: * to endorse or promote products derived from this software without specific
016: * prior written permission.
017: *
018: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
019: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
020: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
021: * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
022: * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
023: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
024: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
025: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
026: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
027: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
028: * POSSIBILITY OF SUCH DAMAGE.
029: */
030: package de.intarsys.pdf.parser;
031:
032: import java.io.IOException;
033: import java.util.ArrayList;
034: import java.util.Arrays;
035: import java.util.List;
036:
037: import de.intarsys.pdf.cos.COSDictionary;
038: import de.intarsys.pdf.cos.COSDocumentElement;
039: import de.intarsys.pdf.cos.COSIndirectObject;
040: import de.intarsys.pdf.cos.COSInteger;
041: import de.intarsys.pdf.cos.COSName;
042: import de.intarsys.pdf.cos.COSNull;
043: import de.intarsys.pdf.cos.COSObject;
044: import de.intarsys.pdf.cos.COSObjectKey;
045: import de.intarsys.pdf.crypt.ISystemSecurityHandler;
046: import de.intarsys.pdf.st.STDocument;
047: import de.intarsys.tools.randomaccess.IRandomAccess;
048:
049: /**
050: * A parser for PDF data streams.
051: *
052: * <p>
053: * The parser will create an object representation of the PDF document using COS
054: * level objects.
055: * </p>
056: *
057: * <p>
058: * The parser is a one pass, read everything implementation.
059: * </p>
060: */
061: public class COSDocumentParser extends PDFParser {
062: private STDocument doc;
063:
064: public COSDocumentParser(STDocument doc) {
065: this .doc = doc;
066: }
067:
068: public boolean isTokenXRefAt(IRandomAccess input, int offset)
069: throws IOException {
070: input.seek(offset);
071: readSpaces(input);
072: byte[] token = new byte[4];
073: input.read(token);
074: return Arrays.equals(token, PDFParser.TOKEN_xref);
075: }
076:
077: /*
078: * create a COS reference out of the objects in the parsers lookahead see
079: * PDF Reference v1.4, chapter 3.2.9 Indirect objects COSReference ::=
080: * objNum genNum "R"
081: *
082: * @return the reference object created
083: *
084: * @throws IOException @throws COSLoadException
085: */
086: protected COSIndirectObject createObjectReference()
087: throws IOException, COSLoadException {
088: COSObject obj1 = lookaheadPop();
089: COSObject obj2 = lookaheadPop();
090: if (!(obj1 instanceof COSInteger)
091: || !(obj2 instanceof COSInteger)) {
092: COSLoadError e = new COSLoadError("illegal reference");
093: handleError(e);
094: }
095: COSObjectKey key = new COSObjectKey(((COSInteger) obj1)
096: .intValue(), ((COSInteger) obj2).intValue());
097: return getDoc().getObjectReference(key);
098: }
099:
100: /**
101: * read a pdf style object from the input. see PDF Reference v1.4, chapter
102: * 3.2.9 Indirect Objects COSIndirectObject ::= ObjNum GenNum "obj" Object
103: * "endobj"
104: *
105: * @return The parsed object.
106: *
107: * @throws IOException
108: * @throws COSLoadException
109: */
110: public COSObject parseIndirectObject(IRandomAccess input,
111: ISystemSecurityHandler securityHandler) throws IOException,
112: COSLoadException {
113: COSObjectKey key = parseIndirectObjectKey(input);
114:
115: // this may be called recursive in a stream for its length
116: COSObjectKey oldObjectKey = getObjectKey();
117: ISystemSecurityHandler oldSecurityHandler = getSecurityHandler();
118: try {
119: setObjectKey(key);
120: setSecurityHandler(securityHandler);
121: COSObject object = parseIndirectObjectBody(input);
122: if (object == null) {
123: object = COSNull.NULL;
124: }
125: return object;
126: } finally {
127: setObjectKey(oldObjectKey);
128: setSecurityHandler(oldSecurityHandler);
129: }
130: }
131:
132: protected COSObject parseIndirectObjectBody(IRandomAccess input)
133: throws IOException, COSLoadException {
134: byte[] token;
135: COSDocumentElement element = parseObject(input);
136: if (element == null) {
137: COSLoadError e = new COSLoadError("object expected");
138: handleError(e);
139: }
140: if (element.isReference()) {
141: COSLoadError e = new COSLoadError(
142: "object reference not allowed in a indirect object");
143: handleError(e);
144: }
145: token = readToken(input);
146: if (token == null) {
147: COSLoadError e = new COSLoadError("unexpected end of file");
148: handleError(e);
149: }
150: if (!Arrays.equals(token, TOKEN_endobj)) {
151: input.seekBy(-token.length);
152: COSLoadWarning w = new COSLoadWarning(C_WARN_ENDOBJ_MISSING);
153: handleWarning(w);
154: }
155: return (COSObject) element;
156: }
157:
158: protected COSObjectKey parseIndirectObjectKey(IRandomAccess input)
159: throws IOException, COSLoadException {
160: byte[] token;
161:
162: // pdfa compliance here, must verify the existence of a single space
163: // between
164: // object number, generation and obj keyword
165: List messages = new ArrayList();
166:
167: // object number
168: if (check) {
169: token = readToken(input, messages);
170: } else {
171: token = readToken(input);
172: }
173: if (token == null) {
174: COSLoadError e = new COSLoadError("unexpected end of file");
175: handleError(e);
176: }
177: int objNumber = 0;
178: try {
179: objNumber = Integer.parseInt(new String(token));
180: if (messages.size() > 0) {
181: COSLoadWarning pwarn = new COSLoadWarning(
182: C_WARN_SINGLESPACE_OBJ);
183: pwarn.setHint(new Long(input.getOffset()));
184: handleWarning(pwarn);
185: }
186: } catch (NumberFormatException ex) {
187: COSLoadError e = new COSLoadError("invalid object number");
188: handleError(e);
189: }
190:
191: // generation number
192: messages.clear();
193: if (check) {
194: token = readToken(input, messages);
195: } else {
196: token = readToken(input);
197: }
198:
199: if (token == null) {
200: COSLoadError e = new COSLoadError("unexpected end of file");
201: handleError(e);
202: }
203: int genNumber = 0;
204: try {
205: genNumber = Integer.parseInt(new String(token));
206: if (messages.size() > 0) {
207: COSLoadWarning pwarn = new COSLoadWarning(
208: C_WARN_SINGLESPACE_OBJ);
209: pwarn.setHint(new Long(input.getOffset()));
210: handleWarning(pwarn);
211: }
212: } catch (NumberFormatException ex) {
213: COSLoadError e = new COSLoadError(
214: "invalid generation number ");
215: handleError(e);
216: }
217:
218: // obj keyword
219: token = readToken(input);
220: if (token == null) {
221: COSLoadError e = new COSLoadError("unexpected end of file");
222: handleError(e);
223: }
224: if (!Arrays.equals(token, TOKEN_obj)) {
225: input.seekBy(-token.length);
226: COSLoadError e = new COSLoadError(
227: "file format error, obj expected");
228: handleError(e);
229: }
230: if (check) {
231: if (!readEOL(input)) {
232: COSLoadWarning pwarn = new COSLoadWarning(
233: C_WARN_SINGLEEOL_OBJ);
234: pwarn.setHint(new Long(input.getOffset()));
235: handleWarning(pwarn);
236: }
237: } else {
238: readSpaces(input);
239: }
240: return new COSObjectKey(objNumber, genNumber);
241: }
242:
243: /**
244: * Searches the offset to the first trailer in the last 1024 bytes of the
245: * document. The search goes backwards starting with the last byte.
246: *
247: * @return the offset to the first trailer found
248: * @throws IOException
249: * @throws COSLoadException
250: */
251: public int searchLastStartXRef(IRandomAccess input)
252: throws IOException, COSLoadException {
253: long startOffset = input.getLength() - 1024;
254: if (startOffset < 0) {
255: startOffset = 0;
256: }
257: input.seek(startOffset);
258: byte[] buffer = new byte[1024];
259: int bytesRead = input.read(buffer);
260:
261: boolean found = false;
262: int bufferOffset;
263: for (bufferOffset = bytesRead - TOKEN_startxref.length; bufferOffset > 0; bufferOffset--) {
264: for (int j = 0; j < TOKEN_startxref.length; j++) {
265: if (buffer[bufferOffset + j] == TOKEN_startxref[j]) {
266: found = true;
267: } else {
268: found = false;
269: break;
270: }
271: }
272: if (found) {
273: break;
274: }
275: }
276: if (found) {
277: long startXRefOffset = startOffset + bufferOffset;
278: input.seek(startXRefOffset);
279: return parseStartXRef(input);
280: }
281: COSLoadError e = new COSLoadError(
282: "no startxref found in the last 1024 bytes of the document");
283: handleError(e);
284: return -1;
285: }
286:
287: /**
288: * the startxref value.
289: *
290: * @return the startxref value
291: * @throws IOException
292: * @throws COSLoadException
293: */
294: public int parseStartXRef(IRandomAccess input) throws IOException,
295: COSLoadException {
296: readSpaces(input);
297: byte[] token = new byte[9];
298: input.read(token);
299: if (!Arrays.equals(token, PDFParser.TOKEN_startxref)) {
300: COSLoadError e = new COSLoadError(
301: "file format error. 'startxref' expected at offset:"
302: + (input.getOffset() - 9));
303: handleError(e);
304: }
305: return readInteger(input, true);
306: }
307:
308: /**
309: * @deprecated Don't use this anymore
310: *
311: * Returns the offset of the dictionary with linearization parameters if
312: * any. Returns -1 otherwise.
313: *
314: * @param input
315: * @return Returns the offset of the dictionary with linearization
316: * parameters if any.
317: * @throws IOException
318: * @throws COSLoadException
319: */
320: public int searchLinearized(IRandomAccess input)
321: throws IOException, COSLoadException {
322: long oldOffset = input.getOffset();
323: int result = -1;
324: input.seek(0);
325: parseComment(input); // file header
326:
327: int next = input.read();
328: while (true) {
329: if (next == -1) {
330: break;
331: }
332: if ((next == ' ') || isWhitespace(next)) { // performance shortcut
333: next = input.read();
334: continue;
335: }
336: input.seekBy(-1);
337: if (isDigit(next)) {
338: result = (int) input.getOffset();
339: COSDocumentElement cosobj = parseIndirectObject(input,
340: null);
341: COSName linearized = COSName.constant("Linearized");
342: if (cosobj instanceof COSDictionary
343: && ((COSDictionary) cosobj)
344: .containsKey(linearized)) {
345: return result;
346: }
347: result = -1;
348: break;
349: } else if (next == '%') {
350: parseComment(input); // this is the binary comment
351: } else {
352: break;
353: }
354: next = input.read();
355: }
356:
357: // reset randomaccess
358: input.seek(oldOffset);
359: return result;
360: }
361:
362: /**
363: * parse the trailer section from the current stream position. see PDF
364: * Reference v1.4, chapter 3.4.4 File Trailer DocumentTrailer ::= "trailer"
365: * COSDict "startxref" COSNumber
366: *
367: * @return the trailer dictionary
368: *
369: * @throws IOException
370: * @throws COSLoadException
371: */
372: public COSDictionary parseTrailer(IRandomAccess input)
373: throws IOException, COSLoadException {
374: byte[] token = new byte[7];
375: int bytesRead = input.read(token);
376: if (!Arrays.equals(token, TOKEN_trailer)) {
377: if (bytesRead > 0) {
378: input.seekBy(-bytesRead);
379: }
380: COSLoadError e = new COSLoadError(
381: "file format error. 'trailer' expected");
382: handleError(e);
383: }
384: readSpaces(input);
385: COSDictionary trailerDict = (COSDictionary) parseObjectDictionary(input);
386: readSpaces(input);
387: return trailerDict;
388: }
389:
390: public STDocument getDoc() {
391: return doc;
392: }
393: }
|