001: package org.contineo.core.text.parser;
002:
003: import java.io.ByteArrayOutputStream;
004: import java.io.File;
005: import java.io.FileInputStream;
006: import java.io.IOException;
007: import java.io.InputStream;
008: import java.io.OutputStreamWriter;
009: import java.text.DateFormat;
010: import java.util.Calendar;
011: import java.util.Date;
012:
013: import org.apache.commons.logging.Log;
014: import org.apache.commons.logging.LogFactory;
015: import org.pdfbox.pdmodel.PDDocument;
016: import org.pdfbox.pdmodel.PDDocumentInformation;
017: import org.pdfbox.util.PDFTextStripper;
018:
019: /**
020: * Parses a PDF document and provides the information. For parsing an external
021: * library is used. Created on 4. November 2003, 18:09
022: *
023: * @author Michael Scholz
024: */
025: public class PDFParser implements Parser {
026: private StringBuffer content = new StringBuffer("");
027:
028: private String author;
029:
030: private String title;
031:
032: private String sourceDate;
033:
034: private String keywords;
035:
036: protected static Log log = LogFactory.getLog(PDFParser.class);
037:
038: public PDFParser(File file) {
039: author = "";
040: title = "";
041: sourceDate = "";
042: keywords = "";
043: init(file);
044: }
045:
046: protected void init(File file) {
047: PDDocument pdfDocument = null;
048:
049: try {
050: InputStream is = new FileInputStream(file);
051: org.pdfbox.pdfparser.PDFParser parser = new org.pdfbox.pdfparser.PDFParser(
052: is);
053:
054: if (parser != null) {
055: parser.parse();
056: } else {
057: throw new Exception("Can not parse pdf file "
058: + file.getName());
059: }
060:
061: pdfDocument = parser.getPDDocument();
062:
063: if (pdfDocument == null) {
064: throw new Exception("Can not get pdf document "
065: + file.getName() + " for parsing");
066: }
067:
068: try {
069: PDDocumentInformation information = pdfDocument
070: .getDocumentInformation();
071:
072: if (information == null) {
073: throw new Exception(
074: "Can not get information from pdf document "
075: + file.getName());
076: }
077:
078: author = information.getAuthor();
079:
080: if (author == null) {
081: author = "";
082: }
083:
084: title = information.getTitle();
085:
086: if (title == null) {
087: title = "";
088: }
089:
090: Calendar calendar = null;
091: try {
092: calendar = information.getCreationDate();
093: } catch (Throwable e) {
094: log.error("Bad date format " + e.getMessage());
095: }
096: Date date = null;
097:
098: if (calendar != null) {
099: date = calendar.getTime();
100: }
101:
102: if (date != null) {
103: sourceDate = DateFormat.getDateInstance().format(
104: date);
105: } else {
106: sourceDate = "";
107: }
108:
109: keywords = information.getKeywords();
110:
111: if (keywords == null) {
112: keywords = "";
113: }
114: } catch (Exception e) {
115: log.error(e.getMessage(), e);
116: }
117:
118: if (pdfDocument.isEncrypted()) {
119: try {
120: // Just try using the default password and move on
121: pdfDocument.decrypt("");
122: } catch (IOException e) {
123: log.error("Unable to decrypt pdf document");
124: }
125: }
126:
127: // create a tmp output stream with the size of the content.
128: ByteArrayOutputStream out = new ByteArrayOutputStream();
129: OutputStreamWriter writer = new OutputStreamWriter(out);
130: PDFTextStripper stripper = new PDFTextStripper();
131: try {
132: stripper.writeText(pdfDocument, writer);
133: } catch (IOException e) {
134: log.error("Unable to decrypt pdf document");
135: writer.write("encrypted document");
136: }
137: writer.close();
138: content = new StringBuffer(out.toString());
139: is.close();
140: out.close();
141: } catch (Exception ex) {
142: log.error(ex.getMessage(), ex);
143: } finally {
144: try {
145: if (pdfDocument != null) {
146: pdfDocument.close();
147: }
148: } catch (Exception e) {
149: log.fatal(e.getMessage(), e);
150: }
151: }
152: }
153:
154: /**
155: *
156: * @uml.property name="content"
157: */
158: public StringBuffer getContent() {
159: return content;
160: }
161:
162: public String getVersion() {
163: return "";
164: }
165:
166: /**
167: * @return Returns the author.
168: * @uml.property name="author"
169: */
170: public String getAuthor() {
171: return author;
172: }
173:
174: /**
175: * @return Returns the sourceDate.
176: * @uml.property name="sourceDate"
177: */
178: public String getSourceDate() {
179: return sourceDate;
180: }
181:
182: /**
183: * @return Returns the keywords.
184: * @uml.property name="keywords"
185: */
186: public String getKeywords() {
187: return keywords;
188: }
189:
190: /**
191: * @return Returns the title.
192: * @uml.property name="title"
193: */
194: public String getTitle() {
195: return title;
196: }
197: }
|