001: /**
002: * LibreSource
003: * Copyright (C) 2004-2008 Artenum SARL / INRIA
004: * http://www.libresource.org - contact@artenum.com
005: *
006: * This file is part of the LibreSource software,
007: * which can be used and distributed under license conditions.
008: * The license conditions are provided in the LICENSE.TXT file
009: * at the root path of the packaging that enclose this file.
010: * More information can be found at
011: * - http://dev.libresource.org/home/license
012: *
013: * Initial authors :
014: *
015: * Guillaume Bort / INRIA
016: * Francois Charoy / Universite Nancy 2
017: * Julien Forest / Artenum
018: * Claude Godart / Universite Henry Poincare
019: * Florent Jouille / INRIA
020: * Sebastien Jourdain / INRIA / Artenum
021: * Yves Lerumeur / Artenum
022: * Pascal Molli / Universite Henry Poincare
023: * Gerald Oster / INRIA
024: * Mariarosa Penzi / Artenum
025: * Gerard Sookahet / Artenum
026: * Raphael Tani / INRIA
027: *
028: * Contributors :
029: *
030: * Stephane Bagnier / Artenum
031: * Amadou Dia / Artenum-IUP Blois
032: * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
033: */package org.libresource.files.parsers;
034:
035: import org.libresource.core.FileData;
036:
037: import org.pdfbox.encryption.DecryptDocument;
038:
039: import org.pdfbox.exceptions.CryptographyException;
040: import org.pdfbox.exceptions.InvalidPasswordException;
041:
042: import org.pdfbox.pdfparser.PDFParser;
043:
044: import org.pdfbox.pdmodel.PDDocument;
045: import org.pdfbox.pdmodel.PDDocumentInformation;
046:
047: import org.pdfbox.util.PDFTextStripper;
048:
049: import java.io.ByteArrayOutputStream;
050: import java.io.IOException;
051: import java.io.OutputStreamWriter;
052:
053: public class PDFContentParser implements FileContentParser {
054: public String parse(FileData fileData) throws Exception {
055: StringBuffer buffer = new StringBuffer();
056: PDDocument pdfDocument = null;
057:
058: try {
059: PDFParser parser = new PDFParser(fileData.getInputStream());
060: parser.parse();
061:
062: pdfDocument = parser.getPDDocument();
063:
064: if (pdfDocument.isEncrypted()) {
065: DecryptDocument decryptor = new DecryptDocument(
066: pdfDocument);
067:
068: //Just try using the default password and move on
069: decryptor.decryptDocument("");
070: }
071:
072: //create a tmp output stream with the size of the content.
073: ByteArrayOutputStream out = new ByteArrayOutputStream();
074: OutputStreamWriter writer = new OutputStreamWriter(out);
075: PDFTextStripper stripper = new PDFTextStripper();
076: stripper.writeText(pdfDocument, writer);
077: writer.close();
078:
079: buffer.append(new String(out.toByteArray()));
080:
081: PDDocumentInformation info = pdfDocument
082: .getDocumentInformation();
083:
084: if (info.getAuthor() != null) {
085: buffer.append(info.getAuthor() + " ");
086: }
087:
088: if (info.getCreator() != null) {
089: buffer.append(info.getCreator() + " ");
090: }
091:
092: if (info.getKeywords() != null) {
093: buffer.append(info.getKeywords() + " ");
094: }
095:
096: if (info.getProducer() != null) {
097: buffer.append(info.getProducer() + " ");
098: }
099:
100: if (info.getSubject() != null) {
101: buffer.append(info.getSubject() + " ");
102: }
103:
104: if (info.getTitle() != null) {
105: buffer.append(info.getTitle() + " ");
106: }
107:
108: if (info.getTrapped() != null) {
109: buffer.append(info.getTrapped() + " ");
110: }
111:
112: pdfDocument.close();
113: } catch (CryptographyException e) {
114: //throw new IOException("Error decrypting document " + e);
115: } catch (InvalidPasswordException e) {
116: //they didn't suppply a password and the default of "" was wrong.
117: //throw new IOException("Error: The document is encrypted and will not be indexed.");
118: } finally {
119: if (pdfDocument != null) {
120: pdfDocument.close();
121: }
122: }
123:
124: return buffer.toString();
125: }
126: }
|