001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Created on Jul 14, 2003
020: *
021: */
022: package org.archive.crawler.extractor;
023:
024: import com.lowagie.text.pdf.PdfReader;
025: import com.lowagie.text.pdf.PdfName;
026: import com.lowagie.text.pdf.PdfObject;
027: import com.lowagie.text.pdf.PdfDictionary;
028: import com.lowagie.text.pdf.PRIndirectReference;
029: import com.lowagie.text.pdf.PdfArray;
030:
031: import java.io.*;
032: import java.util.*;
033:
034: /** Supports PDF parsing operations. For now this primarily means
035: * extracting URIs, but the logic in extractURIs() could easily be adopted/extended
036: * for a variety of PDF processing tasks.
037: *
038: * @author Parker Thompson
039: *
040: */
041: //TODO make this more effecient, it currently had to read the whole file into memory
042: // before processing can begin, and appears to take much longer than it "should"
043: // to parse small, but admittedly complex, documents.
044: public class PDFParser {
045:
046: ArrayList<String> foundURIs;
047: ArrayList<ArrayList<Integer>> encounteredReferences;
048: PdfReader documentReader;
049: byte[] document;
050: PdfDictionary catalog;
051:
052: public PDFParser(String doc) throws IOException {
053: resetState();
054: getInFromFile(doc);
055: initialize();
056: }
057:
058: public PDFParser(byte[] doc) throws IOException {
059: resetState();
060: document = doc;
061: initialize();
062: }
063:
064: /** Reinitialize the object as though a new one were created.
065: */
066: protected void resetState() {
067: foundURIs = new ArrayList<String>();
068: encounteredReferences = new ArrayList<ArrayList<Integer>>();
069: documentReader = null;
070: document = null;
071: catalog = null;
072:
073: for (int i = 0; i < encounteredReferences.size(); i++) {
074: encounteredReferences.add(new ArrayList<Integer>());
075: }
076: }
077:
078: /**
079: * Reset the object and initialize it with a new byte array (the document).
080: * @param doc
081: * @throws IOException
082: */
083: public void resetState(byte[] doc) throws IOException {
084: resetState();
085: document = doc;
086: initialize();
087: }
088:
089: /** Reinitialize the object as though a new one were created, complete
090: * with a valid pointer to a document that can be read
091: * @param doc
092: * @throws IOException
093: */
094: public void resetState(String doc) throws IOException {
095: resetState();
096: getInFromFile(doc);
097: initialize();
098: }
099:
100: /**
101: * Read a file named 'doc' and store its' bytes for later processing.
102: * @param doc
103: * @throws IOException
104: */
105: protected void getInFromFile(String doc) throws IOException {
106: File documentOnDisk = new File(doc);
107:
108: long length = documentOnDisk.length();
109: document = new byte[(int) length];
110:
111: FileInputStream inStream = new FileInputStream(documentOnDisk);
112:
113: inStream.read(document);
114: }
115:
116: /**
117: * Indicates, based on a PDFObject's generation/id pair whether
118: * the parser has already encountered this object (or a reference to it)
119: * so we don't infinitely loop on circuits within the PDF.
120: * @param generation
121: * @param id
122: * @return True if already seen.
123: */
124: protected boolean haveSeen(int generation, int id) {
125:
126: // if we can't store this generation grow our list until we can
127: if (generation >= encounteredReferences.size()) {
128: for (int i = encounteredReferences.size(); i <= generation; i++) {
129: encounteredReferences.add(new ArrayList<Integer>());
130: }
131:
132: // clearly we haven't seen it
133: return false;
134: }
135:
136: ArrayList<Integer> generationList = encounteredReferences
137: .get(generation);
138:
139: for (int i : generationList) {
140: if (i == id) {
141: return true;
142: }
143: }
144: return false;
145: }
146:
147: /**
148: * Note that an object (id/generation pair) has been seen by this parser
149: * so that it can be handled differently when it is encountered again.
150: * @param generation
151: * @param id
152: */
153: protected void markAsSeen(int generation, int id) {
154: ArrayList<Integer> objectIds = encounteredReferences
155: .get(generation);
156: objectIds.add(id);
157: }
158:
159: /**
160: * Get a list of URIs retrieved from the Pdf during the
161: * extractURIs operation.
162: * @return A list of URIs retrieved from the Pdf during the
163: * extractURIs operation.
164: */
165: public ArrayList getURIs() {
166: return foundURIs;
167: }
168:
169: /**
170: * Initialize opens the document for reading. This is done implicitly
171: * by the constuctor. This should only need to be called directly following
172: * a reset.
173: * @throws IOException
174: */
175: protected void initialize() throws IOException {
176: if (document != null) {
177: documentReader = new PdfReader(document);
178: }
179:
180: catalog = documentReader.getCatalog();
181: }
182:
183: /**
184: * Extract URIs from all objects found in a Pdf document's catalog.
185: * Returns an array list representing all URIs found in the document catalog tree.
186: * @return URIs from all objects found in a Pdf document's catalog.
187: */
188: public ArrayList extractURIs() {
189: extractURIs(catalog);
190: return getURIs();
191: }
192:
193: /**
194: * Parse a PdfDictionary, looking for URIs recursively and adding
195: * them to foundURIs
196: * @param entity
197: */
198: protected void extractURIs(PdfObject entity) {
199:
200: // deal with dictionaries
201: if (entity.isDictionary()) {
202:
203: PdfDictionary dictionary = (PdfDictionary) entity;
204:
205: @SuppressWarnings("unchecked")
206: Set<PdfName> allkeys = dictionary.getKeys();
207: for (PdfName key : allkeys) {
208: PdfObject value = dictionary.get(key);
209:
210: // see if it's the key is a UR[I,L]
211: if (key.toString().equals("/URI")
212: || key.toString().equals("/URL")) {
213: foundURIs.add(value.toString());
214:
215: } else {
216: this .extractURIs(value);
217: }
218:
219: }
220:
221: // deal with arrays
222: } else if (entity.isArray()) {
223:
224: PdfArray array = (PdfArray) entity;
225: ArrayList arrayObjects = array.getArrayList();
226: Iterator objectList = arrayObjects.iterator();
227:
228: while (objectList.hasNext()) {
229: this .extractURIs((PdfObject) objectList.next());
230: }
231:
232: // deal with indirect references
233: } else if (entity.getClass() == PRIndirectReference.class) {
234:
235: PRIndirectReference indirect = (PRIndirectReference) entity;
236:
237: // if we've already seen a reference to this object
238: if (haveSeen(indirect.getGeneration(), indirect.getNumber())) {
239: return;
240:
241: // note that we've seen it if it's new
242: } else {
243: markAsSeen(indirect.getGeneration(), indirect
244: .getNumber());
245: }
246:
247: // dereference the "pointer" and process the object
248: indirect.getReader(); // FIXME: examine side-effects
249: PdfObject direct = PdfReader.getPdfObject(indirect);
250:
251: this .extractURIs(direct);
252: }
253: }
254:
255: public static void main(String[] argv) {
256:
257: try {
258: PDFParser parser = new PDFParser(
259: "/home/parkert/files/pdfspec.pdf");
260:
261: ArrayList uris = parser.extractURIs();
262:
263: Iterator i = uris.iterator();
264:
265: while (i.hasNext()) {
266: String uri = (String) i.next();
267: System.out.println("got uri: " + uri);
268: }
269:
270: } catch (IOException e) {
271: e.printStackTrace();
272: }
273: }
274: }
|