001: /*
002: * $Id: PDFDocument.java 414 2003-11-29 23:29:56Z mrgadget4711 $
003: *
004: * Filename : PDFDocument.java Project : vqwiki-classic
005: */
006: package vqwiki.utils.lucene;
007:
008: import java.io.ByteArrayOutputStream;
009: import java.io.File;
010: import java.io.FileInputStream;
011: import java.io.FileNotFoundException;
012: import java.io.IOException;
013: import java.io.OutputStreamWriter;
014:
015: import org.pdfbox.encryption.DecryptDocument;
016: import org.pdfbox.exceptions.CryptographyException;
017: import org.pdfbox.exceptions.InvalidPasswordException;
018: import org.pdfbox.pdfparser.PDFParser;
019: import org.pdfbox.pdmodel.PDDocument;
020: import org.pdfbox.util.PDFTextStripper;
021:
022: /**
023: * Get the content of a PDF file
024: *
025: * This class was created on 29.11.2003
026: *
027: * @author $Author: mrgadget4711 $
028: */
029: public class PDFDocument {
030:
031: /**
032: * Actually get the content of a PDF file
033: *
034: * @param attachmentFileName
035: * String with the filename of the file to read
036: * @param attachmentFile
037: * File handler of the file to read
038: * @return StringBuffer containing the (textual) content of the PDF file
039: * @throws FileNotFoundException
040: * If the PDF file cannot be found
041: * @throws IOException
042: * If the PDF file cannot be read
043: */
044: public static StringBuffer getContentOfPDFFile(
045: String attachmentFileName, File attachmentFile)
046: throws FileNotFoundException, IOException {
047: StringBuffer contents = new StringBuffer();
048: FileInputStream input = null;
049: try {
050: input = new FileInputStream(attachmentFile);
051: PDDocument pdfDocument = null;
052: try {
053: PDFParser parser = new PDFParser(input);
054: parser.parse();
055:
056: pdfDocument = parser.getPDDocument();
057:
058: if (pdfDocument.isEncrypted()) {
059: DecryptDocument decryptor = new DecryptDocument(
060: pdfDocument);
061: //Just try using the default password and move on
062: decryptor.decryptDocument("");
063: }
064:
065: //create a tmp output stream with the size of the content.
066: ByteArrayOutputStream out = new ByteArrayOutputStream();
067: OutputStreamWriter writer = new OutputStreamWriter(out);
068: PDFTextStripper stripper = new PDFTextStripper();
069: stripper.writeText(pdfDocument.getDocument(), writer);
070: writer.close();
071:
072: contents.append(" ");
073: contents.append(out.toString());
074: } catch (CryptographyException e) {
075: throw new IOException("Error decrypting document("
076: + attachmentFileName + "): " + e);
077: } catch (InvalidPasswordException e) {
078: //they didn't suppply a password and the default of "" was
079: // wrong.
080: throw new IOException("Error: The document("
081: + attachmentFileName
082: + ") is encrypted and will not be indexed.");
083: } finally {
084: if (pdfDocument != null) {
085: pdfDocument.close();
086: }
087: }
088: } finally {
089: if (input != null) {
090: input.close();
091: }
092: }
093: return contents;
094: }
095: }
096: /*
097: * Log:
098: *
099: * $Log$
100: * Revision 1.1 2003/11/29 23:29:56 mrgadget4711
101: * Initial version -- moved from AbstractSearchEngine
102: *
103: * ------------END------------
104: */
|