001: package org.openedit.sitesearch.parse;
002:
003: import java.io.ByteArrayInputStream;
004: import java.io.IOException;
005:
006: import org.apache.commons.logging.Log;
007: import org.apache.commons.logging.LogFactory;
008: import org.openedit.sitesearch.Content;
009: import org.openedit.sitesearch.Parse;
010: import org.openedit.sitesearch.Parser;
011: import org.pdfbox.encryption.DocumentEncryption;
012: import org.pdfbox.exceptions.CryptographyException;
013: import org.pdfbox.exceptions.InvalidPasswordException;
014: import org.pdfbox.pdfparser.PDFParser;
015: import org.pdfbox.pdmodel.PDDocument;
016: import org.pdfbox.pdmodel.PDDocumentInformation;
017: import org.pdfbox.util.PDFTextStripper;
018:
019: import com.openedit.OpenEditException;
020:
021: public class PdfParser implements Parser {
022: private static final Log log = LogFactory.getLog(PdfParser.class);
023:
024: public Parse getParse(Content content) throws OpenEditException {
025: log.info("Parse " + content.getUrl());
026: Parse results = new Parse();
027:
028: // in memory representation of pdf file
029: PDDocument pdf = null;
030:
031: try {
032:
033: byte[] raw = content.getContent();
034: if (raw == null) {
035: return null;
036: }
037: // String contentLength = content.get("Content-Length");
038: // if (contentLength != null && raw.length != Integer.parseInt(contentLength))
039: // {
040: // log.error( ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
041: // }
042:
043: PDFParser parser = new PDFParser(new ByteArrayInputStream(
044: raw));
045: parser.parse();
046:
047: pdf = parser.getPDDocument();
048:
049: if (pdf.isEncrypted()) {
050: DocumentEncryption decryptor = new DocumentEncryption(
051: pdf);
052: // Just try using the default password and move on
053: decryptor.decryptDocument("");
054: }
055:
056: // collect text
057: PDFTextStripper stripper = new PDFTextStripper();
058: String text = null;
059: String title = null;
060:
061: text = stripper.getText(pdf);
062: results.setText(text);
063:
064: // collect title
065: PDDocumentInformation info = pdf.getDocumentInformation();
066: title = info.getTitle();
067: results.setTitle(title);
068:
069: Thread.sleep(500); //Slow down PDF's loading
070: // use.
071: // pdf.getPageCount();
072: // info.getAuthor()
073: // info.getSubject()
074: // info.getKeywords()
075: // info.getCreator()
076: // info.getProducer()
077: // info.getTrapped()
078: // formatDate(info.getCreationDate())
079: // formatDate(info.getModificationDate())
080:
081: } catch (CryptographyException e) {
082: log.error("Error decrypting document. " + e);
083: } catch (InvalidPasswordException e) {
084: log
085: .error("Can't decrypt document - invalid password. "
086: + e);
087: } catch (Exception e) { // run time exception
088: log.error("Can't be handled as pdf document. " + e);
089: } finally {
090: try {
091: if (pdf != null)
092: pdf.close();
093: } catch (IOException e) {
094: // nothing to do
095: }
096: }
097:
098: // // collect outlink
099: // Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
100: //
101: // // collect meta data
102: // Properties metadata = new Properties();
103: // metadata.putAll(content.getMetadata()); // copy through
104: //
105: // ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
106: // return new ParseImpl(text, parseData);
107: return results;
108:
109: }
110:
111: }
|