001: /*
002: * $Id: PRTokeniser.java 2517 2006-12-28 19:41:02Z psoares33 $
003: *
004: * Copyright 2001, 2002 by Paulo Soares.
005: *
006: * The contents of this file are subject to the Mozilla Public License Version 1.1
007: * (the "License"); you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at http://www.mozilla.org/MPL/
009: *
010: * Software distributed under the License is distributed on an "AS IS" basis,
011: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
012: * for the specific language governing rights and limitations under the License.
013: *
014: * The Original Code is 'iText, a free JAVA-PDF library'.
015: *
016: * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
017: * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
018: * All Rights Reserved.
019: * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
020: * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
021: *
022: * Contributor(s): all the names of the contributors are added in the source code
023: * where applicable.
024: *
025: * Alternatively, the contents of this file may be used under the terms of the
026: * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
027: * provisions of LGPL are applicable instead of those above. If you wish to
028: * allow use of your version of this file only under the terms of the LGPL
029: * License and not to allow others to use your version of this file under
030: * the MPL, indicate your decision by deleting the provisions above and
031: * replace them with the notice and other provisions required by the LGPL.
032: * If you do not delete the provisions above, a recipient may use your version
033: * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
034: *
035: * This library is free software; you can redistribute it and/or modify it
036: * under the terms of the MPL as stated above or under the terms of the GNU
037: * Library General Public License as published by the Free Software Foundation;
038: * either version 2 of the License, or any later version.
039: *
040: * This library is distributed in the hope that it will be useful, but WITHOUT
041: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
042: * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
043: * details.
044: *
045: * If you didn't download this code from the following link, you should check if
046: * you aren't using an obsolete version:
047: * http://www.lowagie.com/iText/
048: */
049:
050: package com.lowagie.text.pdf;
051:
052: import java.io.IOException;
053:
054: /**
055: *
056: * @author Paulo Soares (psoares@consiste.pt)
057: */
058: public class PRTokeniser {
059:
060: public static final int TK_NUMBER = 1;
061: public static final int TK_STRING = 2;
062: public static final int TK_NAME = 3;
063: public static final int TK_COMMENT = 4;
064: public static final int TK_START_ARRAY = 5;
065: public static final int TK_END_ARRAY = 6;
066: public static final int TK_START_DIC = 7;
067: public static final int TK_END_DIC = 8;
068: public static final int TK_REF = 9;
069: public static final int TK_OTHER = 10;
070: public static final boolean delims[] = { true, true, false, false,
071: false, false, false, false, false, false, true, true,
072: false, true, true, false, false, false, false, false,
073: false, false, false, false, false, false, false, false,
074: false, false, false, false, false, true, false, false,
075: false, false, true, false, false, true, true, false, false,
076: false, false, false, true, false, false, false, false,
077: false, false, false, false, false, false, false, false,
078: true, false, true, false, false, false, false, false,
079: false, false, false, false, false, false, false, false,
080: false, false, false, false, false, false, false, false,
081: false, false, false, false, false, false, false, true,
082: false, true, false, false, false, false, false, false,
083: false, false, false, false, false, false, false, false,
084: false, false, false, false, false, false, false, false,
085: false, false, false, false, false, false, false, false,
086: false, false, false, false, false, false, false, false,
087: false, false, false, false, false, false, false, false,
088: false, false, false, false, false, false, false, false,
089: false, false, false, false, false, false, false, false,
090: false, false, false, false, false, false, false, false,
091: false, false, false, false, false, false, false, false,
092: false, false, false, false, false, false, false, false,
093: false, false, false, false, false, false, false, false,
094: false, false, false, false, false, false, false, false,
095: false, false, false, false, false, false, false, false,
096: false, false, false, false, false, false, false, false,
097: false, false, false, false, false, false, false, false,
098: false, false, false, false, false, false, false, false,
099: false, false, false, false, false, false, false, false,
100: false, false, false, false, false, false, false, false,
101: false, false, false, false, false, false, false, false,
102: false, false, false, false };
103:
104: static final String EMPTY = "";
105:
106: protected RandomAccessFileOrArray file;
107: protected int type;
108: protected String stringValue;
109: protected int reference;
110: protected int generation;
111: protected boolean hexString;
112:
113: public PRTokeniser(String filename) throws IOException {
114: file = new RandomAccessFileOrArray(filename);
115: }
116:
117: public PRTokeniser(byte pdfIn[]) {
118: file = new RandomAccessFileOrArray(pdfIn);
119: }
120:
121: public PRTokeniser(RandomAccessFileOrArray file) {
122: this .file = file;
123: }
124:
125: public void seek(int pos) throws IOException {
126: file.seek(pos);
127: }
128:
129: public int getFilePointer() throws IOException {
130: return file.getFilePointer();
131: }
132:
133: public void close() throws IOException {
134: file.close();
135: }
136:
137: public int length() throws IOException {
138: return file.length();
139: }
140:
141: public int read() throws IOException {
142: return file.read();
143: }
144:
145: public RandomAccessFileOrArray getSafeFile() {
146: return new RandomAccessFileOrArray(file);
147: }
148:
149: public RandomAccessFileOrArray getFile() {
150: return file;
151: }
152:
153: public String readString(int size) throws IOException {
154: StringBuffer buf = new StringBuffer();
155: int ch;
156: while ((size--) > 0) {
157: ch = file.read();
158: if (ch == -1)
159: break;
160: buf.append((char) ch);
161: }
162: return buf.toString();
163: }
164:
165: public static final boolean isWhitespace(int ch) {
166: return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32);
167: }
168:
169: public static final boolean isDelimiter(int ch) {
170: return (ch == '(' || ch == ')' || ch == '<' || ch == '>'
171: || ch == '[' || ch == ']' || ch == '/' || ch == '%');
172: }
173:
174: public static final boolean isDelimiterWhitespace(int ch) {
175: return delims[ch + 1];
176: }
177:
178: public int getTokenType() {
179: return type;
180: }
181:
182: public String getStringValue() {
183: return stringValue;
184: }
185:
186: public int getReference() {
187: return reference;
188: }
189:
190: public int getGeneration() {
191: return generation;
192: }
193:
194: public void backOnePosition(int ch) {
195: if (ch != -1)
196: file.pushBack((byte) ch);
197: }
198:
199: public void throwError(String error) throws IOException {
200: throw new IOException(error + " at file pointer "
201: + file.getFilePointer());
202: }
203:
204: public char checkPdfHeader() throws IOException {
205: file.setStartOffset(0);
206: String str = readString(1024);
207: int idx = str.indexOf("%PDF-");
208: if (idx < 0)
209: throw new IOException("PDF header signature not found.");
210: file.setStartOffset(idx);
211: return str.charAt(idx + 7);
212: }
213:
214: public void checkFdfHeader() throws IOException {
215: file.setStartOffset(0);
216: String str = readString(1024);
217: int idx = str.indexOf("%FDF-1.2");
218: if (idx < 0)
219: throw new IOException("FDF header signature not found.");
220: file.setStartOffset(idx);
221: }
222:
223: public int getStartxref() throws IOException {
224: int size = Math.min(1024, file.length());
225: int pos = file.length() - size;
226: file.seek(pos);
227: String str = readString(1024);
228: int idx = str.lastIndexOf("startxref");
229: if (idx < 0)
230: throw new IOException("PDF startxref not found.");
231: return pos + idx;
232: }
233:
234: public static int getHex(int v) {
235: if (v >= '0' && v <= '9')
236: return v - '0';
237: if (v >= 'A' && v <= 'F')
238: return v - 'A' + 10;
239: if (v >= 'a' && v <= 'f')
240: return v - 'a' + 10;
241: return -1;
242: }
243:
244: public void nextValidToken() throws IOException {
245: int level = 0;
246: String n1 = null;
247: String n2 = null;
248: int ptr = 0;
249: while (nextToken()) {
250: if (type == TK_COMMENT)
251: continue;
252: switch (level) {
253: case 0: {
254: if (type != TK_NUMBER)
255: return;
256: ptr = file.getFilePointer();
257: n1 = stringValue;
258: ++level;
259: break;
260: }
261: case 1: {
262: if (type != TK_NUMBER) {
263: file.seek(ptr);
264: type = TK_NUMBER;
265: stringValue = n1;
266: return;
267: }
268: n2 = stringValue;
269: ++level;
270: break;
271: }
272: default: {
273: if (type != TK_OTHER || !stringValue.equals("R")) {
274: file.seek(ptr);
275: type = TK_NUMBER;
276: stringValue = n1;
277: return;
278: }
279: type = TK_REF;
280: reference = Integer.parseInt(n1);
281: generation = Integer.parseInt(n2);
282: return;
283: }
284: }
285: }
286: throwError("Unexpected end of file");
287: }
288:
289: public boolean nextToken() throws IOException {
290: StringBuffer outBuf = null;
291: stringValue = EMPTY;
292: int ch = 0;
293: do {
294: ch = file.read();
295: } while (ch != -1 && isWhitespace(ch));
296: if (ch == -1)
297: return false;
298: switch (ch) {
299: case '[':
300: type = TK_START_ARRAY;
301: break;
302: case ']':
303: type = TK_END_ARRAY;
304: break;
305: case '/': {
306: outBuf = new StringBuffer();
307: type = TK_NAME;
308: while (true) {
309: ch = file.read();
310: if (delims[ch + 1])
311: break;
312: if (ch == '#') {
313: ch = (getHex(file.read()) << 4)
314: + getHex(file.read());
315: }
316: outBuf.append((char) ch);
317: }
318: backOnePosition(ch);
319: break;
320: }
321: case '>':
322: ch = file.read();
323: if (ch != '>')
324: throwError("'>' not expected");
325: type = TK_END_DIC;
326: break;
327: case '<': {
328: int v1 = file.read();
329: if (v1 == '<') {
330: type = TK_START_DIC;
331: break;
332: }
333: outBuf = new StringBuffer();
334: type = TK_STRING;
335: hexString = true;
336: int v2 = 0;
337: while (true) {
338: while (isWhitespace(v1))
339: v1 = file.read();
340: if (v1 == '>')
341: break;
342: v1 = getHex(v1);
343: if (v1 < 0)
344: break;
345: v2 = file.read();
346: while (isWhitespace(v2))
347: v2 = file.read();
348: if (v2 == '>') {
349: ch = v1 << 4;
350: outBuf.append((char) ch);
351: break;
352: }
353: v2 = getHex(v2);
354: if (v2 < 0)
355: break;
356: ch = (v1 << 4) + v2;
357: outBuf.append((char) ch);
358: v1 = file.read();
359: }
360: if (v1 < 0 || v2 < 0)
361: throwError("Error reading string");
362: break;
363: }
364: case '%':
365: type = TK_COMMENT;
366: do {
367: ch = file.read();
368: } while (ch != -1 && ch != '\r' && ch != '\n');
369: break;
370: case '(': {
371: outBuf = new StringBuffer();
372: type = TK_STRING;
373: hexString = false;
374: int nesting = 0;
375: while (true) {
376: ch = file.read();
377: if (ch == -1)
378: break;
379: if (ch == '(') {
380: ++nesting;
381: } else if (ch == ')') {
382: --nesting;
383: } else if (ch == '\\') {
384: boolean lineBreak = false;
385: ch = file.read();
386: switch (ch) {
387: case 'n':
388: ch = '\n';
389: break;
390: case 'r':
391: ch = '\r';
392: break;
393: case 't':
394: ch = '\t';
395: break;
396: case 'b':
397: ch = '\b';
398: break;
399: case 'f':
400: ch = '\f';
401: break;
402: case '(':
403: case ')':
404: case '\\':
405: break;
406: case '\r':
407: lineBreak = true;
408: ch = file.read();
409: if (ch != '\n')
410: backOnePosition(ch);
411: break;
412: case '\n':
413: lineBreak = true;
414: break;
415: default: {
416: if (ch < '0' || ch > '7') {
417: break;
418: }
419: int octal = ch - '0';
420: ch = file.read();
421: if (ch < '0' || ch > '7') {
422: backOnePosition(ch);
423: ch = octal;
424: break;
425: }
426: octal = (octal << 3) + ch - '0';
427: ch = file.read();
428: if (ch < '0' || ch > '7') {
429: backOnePosition(ch);
430: ch = octal;
431: break;
432: }
433: octal = (octal << 3) + ch - '0';
434: ch = octal & 0xff;
435: break;
436: }
437: }
438: if (lineBreak)
439: continue;
440: if (ch < 0)
441: break;
442: } else if (ch == '\r') {
443: ch = file.read();
444: if (ch < 0)
445: break;
446: if (ch != '\n') {
447: backOnePosition(ch);
448: ch = '\n';
449: }
450: }
451: if (nesting == -1)
452: break;
453: outBuf.append((char) ch);
454: }
455: if (ch == -1)
456: throwError("Error reading string");
457: break;
458: }
459: default: {
460: outBuf = new StringBuffer();
461: if (ch == '-' || ch == '+' || ch == '.'
462: || (ch >= '0' && ch <= '9')) {
463: type = TK_NUMBER;
464: do {
465: outBuf.append((char) ch);
466: ch = file.read();
467: } while (ch != -1
468: && ((ch >= '0' && ch <= '9') || ch == '.'));
469: } else {
470: type = TK_OTHER;
471: do {
472: outBuf.append((char) ch);
473: ch = file.read();
474: } while (!delims[ch + 1]);
475: }
476: backOnePosition(ch);
477: break;
478: }
479: }
480: if (outBuf != null)
481: stringValue = outBuf.toString();
482: return true;
483: }
484:
485: public int intValue() {
486: return Integer.parseInt(stringValue);
487: }
488:
489: public boolean readLineSegment(byte input[]) throws IOException {
490: int c = -1;
491: boolean eol = false;
492: int ptr = 0;
493: int len = input.length;
494: // ssteward, pdftk-1.10, 040922:
495: // skip initial whitespace; added this because PdfReader.rebuildXref()
496: // assumes that line provided by readLineSegment does not have init. whitespace;
497: if (ptr < len) {
498: while (isWhitespace((c = read())))
499: ;
500: }
501: while (!eol && ptr < len) {
502: switch (c) {
503: case -1:
504: case '\n':
505: eol = true;
506: break;
507: case '\r':
508: eol = true;
509: int cur = getFilePointer();
510: if ((read()) != '\n') {
511: seek(cur);
512: }
513: break;
514: default:
515: input[ptr++] = (byte) c;
516: break;
517: }
518:
519: // break loop? do it before we read() again
520: if (eol || len <= ptr) {
521: break;
522: } else {
523: c = read();
524: }
525: }
526: if (ptr >= len) {
527: eol = false;
528: while (!eol) {
529: switch (c = read()) {
530: case -1:
531: case '\n':
532: eol = true;
533: break;
534: case '\r':
535: eol = true;
536: int cur = getFilePointer();
537: if ((read()) != '\n') {
538: seek(cur);
539: }
540: break;
541: }
542: }
543: }
544:
545: if ((c == -1) && (ptr == 0)) {
546: return false;
547: }
548: if (ptr + 2 <= len) {
549: input[ptr++] = (byte) ' ';
550: input[ptr] = (byte) 'X';
551: }
552: return true;
553: }
554:
555: public static int[] checkObjectStart(byte line[]) {
556: try {
557: PRTokeniser tk = new PRTokeniser(line);
558: int num = 0;
559: int gen = 0;
560: if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
561: return null;
562: num = tk.intValue();
563: if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
564: return null;
565: gen = tk.intValue();
566: if (!tk.nextToken())
567: return null;
568: if (!tk.getStringValue().equals("obj"))
569: return null;
570: return new int[] { num, gen };
571: } catch (Exception ioe) {
572: // empty on purpose
573: }
574: return null;
575: }
576:
577: public boolean isHexString() {
578: return this.hexString;
579: }
580:
581: }
|