001: /***************************************************************
002: * This file is part of the [fleXive](R) project.
003: *
004: * Copyright (c) 1999-2008
005: * UCS - unique computing solutions gmbh (http://www.ucs.at)
006: * All rights reserved
007: *
008: * The [fleXive](R) project is free software; you can redistribute
009: * it and/or modify it under the terms of the GNU General Public
010: * License as published by the Free Software Foundation;
011: * either version 2 of the License, or (at your option) any
012: * later version.
013: *
014: * The GNU General Public License can be found at
015: * http://www.gnu.org/copyleft/gpl.html.
016: * A copy is found in the textfile GPL.txt and important notices to the
017: * license from the author are found in LICENSE.txt distributed with
018: * these libraries.
019: *
020: * This library is distributed in the hope that it will be useful,
021: * but WITHOUT ANY WARRANTY; without even the implied warranty of
022: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: * GNU General Public License for more details.
024: *
025: * For further information about UCS - unique computing solutions gmbh,
026: * please see the company website: http://www.ucs.at
027: *
028: * For further information about [fleXive](R), please see the
029: * project website: http://www.flexive.org
030: *
031: *
032: * This copyright notice MUST APPEAR in all copies of the file!
033: ***************************************************************/package com.flexive.extractor;
034:
035: import org.apache.poi.hpsf.PropertySetFactory;
036: import org.apache.poi.hpsf.SummaryInformation;
037: import org.apache.poi.poifs.eventfilesystem.POIFSReader;
038: import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
039: import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
040: import org.pdfbox.pdmodel.PDDocument;
041: import org.pdfbox.pdmodel.PDDocumentInformation;
042:
043: import java.io.FileInputStream;
044: import java.io.InputStream;
045: import java.io.Serializable;
046: import java.util.Date;
047:
048: /**
049: * A class storing information about documents (pdf, word, excel, ...)
050: */
051: public class FxSummaryInformation implements Serializable {
052: String author;
053: String applicationName;
054: long charCount;
055: String comments;
056: Date createdAt;
057: Date editTime;
058: String keywords;
059: String lastModifiedBy;
060: Date lastPrintedAt;
061: String title;
062: Date lastModifiedAt;
063: int pageCount;
064: String revNumber;
065: int wordCount;
066: boolean encrypted;
067: String additionalText;
068:
069: public FxSummaryInformation(String author, String applicationName,
070: long charCount, String comments, Date createdAt,
071: Date editTime, String keywords, String lastModifiedBy,
072: Date lastPrintedAt, String title, Date lastModifiedAt,
073: int pageCount, String revNumber, int wordCount,
074: boolean encrypted, String additionalText) {
075: this .author = author;
076: this .applicationName = applicationName == null ? ""
077: : applicationName;
078: this .charCount = charCount;
079: this .comments = comments == null ? "" : comments;
080: if (createdAt != null)
081: this .createdAt = (Date) createdAt.clone();
082: else
083: this .createdAt = new Date();
084: if (editTime != null)
085: this .editTime = (Date) editTime.clone();
086: else
087: this .editTime = new Date();
088: this .keywords = keywords == null ? "" : keywords;
089: this .lastModifiedBy = lastModifiedBy;
090: if (lastPrintedAt != null)
091: this .lastPrintedAt = (Date) lastPrintedAt.clone();
092: else
093: this .lastPrintedAt = new Date();
094: this .title = title == null ? "" : title;
095: if (lastModifiedAt != null)
096: this .lastModifiedAt = (Date) lastModifiedAt.clone();
097: else
098: this .lastModifiedAt = new Date();
099: this .pageCount = pageCount;
100: this .revNumber = revNumber == null ? "" : revNumber;
101: this .wordCount = wordCount;
102: this .encrypted = encrypted;
103: this .additionalText = additionalText == null ? ""
104: : additionalText;
105: }
106:
107: public String toString() {
108: return "author=" + this .author + ", " + "application="
109: + this .applicationName + ", " + "charCount:"
110: + this .charCount + ", " + "comments:" + this .comments
111: + ", " + "createdAt:" + this .createdAt + ", "
112: + "editTime:" + this .editTime + ", " + "keywords:"
113: + this .keywords + ", " + "lastModifiedBy:"
114: + this .lastModifiedBy + ", " + "lastPrintedAt:"
115: + this .lastPrintedAt + ", " + "title:" + this .title
116: + ", " + "lastModifiedAt:" + this .lastModifiedAt + ", "
117: + "pageCount:" + this .pageCount + ", " + "revNumber:"
118: + this .revNumber + ", " + "wordCount:" + this .wordCount
119: + ", " + "encrypted:" + this .encrypted;
120: }
121:
122: public String toXML() {
123: StringBuilder sb = new StringBuilder(1000);
124: sb.append("<summary>");
125: encodeXML(sb, "author", author);
126: encodeXML(sb, "applicationName", applicationName);
127: encodeXML(sb, "charCount", charCount);
128: encodeXML(sb, "comments", comments);
129: encodeXML(sb, "createdAt", createdAt);
130: encodeXML(sb, "editTime", editTime);
131: encodeXML(sb, "keywords", keywords);
132: encodeXML(sb, "lastModifiedBy", lastModifiedBy);
133: encodeXML(sb, "lastPrintedAt", lastPrintedAt);
134: encodeXML(sb, "title", title);
135: encodeXML(sb, "lastModifiedAt", lastModifiedAt);
136: encodeXML(sb, "pageCount", pageCount);
137: encodeXML(sb, "revNumber", revNumber);
138: encodeXML(sb, "wordCount", wordCount);
139: encodeXML(sb, "encrypted", encrypted);
140: encodeXML(sb, "additionalText", additionalText);
141: sb.append("</summary>");
142: return sb.toString();
143: }
144:
145: private void encodeXML(StringBuilder sb, String tag, Object data) {
146: if (data != null) {
147: sb.append("<").append(tag).append(">");
148: if (data instanceof String)
149: sb.append("<![CDATA[").append(data).append("]]>");
150: else if (data instanceof Date)
151: sb.append(((Date) data).getTime());
152: else
153: sb.append(data);
154: sb.append("</").append(tag).append(">");
155: }
156: }
157:
158: /**
159: * Constructor.
160: *
161: * @param si the summary information
162: */
163: public FxSummaryInformation(SummaryInformation si) {
164: author = si.getAuthor();
165: applicationName = si.getApplicationName();
166: charCount = si.getCharCount();
167: comments = si.getComments();
168: createdAt = si.getCreateDateTime();
169: editTime = si.getEditTime();
170: keywords = si.getKeywords();
171: lastModifiedBy = si.getLastAuthor();
172: lastPrintedAt = si.getLastPrinted();
173: title = si.getTitle();
174: lastModifiedAt = si.getLastSaveDateTime();
175: pageCount = si.getPageCount();
176: revNumber = si.getRevNumber();
177: wordCount = si.getWordCount();
178: encrypted = false;
179: }
180:
181: public FxSummaryInformation(final PDDocument pdf) {
182: final PDDocumentInformation pi = pdf.getDocumentInformation();
183: author = pi.getAuthor();
184: applicationName = pi.getProducer();
185: charCount = -1;
186: comments = "";
187: try {
188: createdAt = pi.getCreationDate().getTime();
189: } catch (Exception exc) {
190: createdAt = null;
191: }
192: try {
193: editTime = pi.getModificationDate().getTime();
194: lastModifiedAt = editTime;
195: } catch (Exception exc) {
196: editTime = null;
197: lastModifiedAt = null;
198: }
199: keywords = pi.getKeywords();
200: lastPrintedAt = null;
201: title = pi.getTitle();
202: pageCount = pdf.getNumberOfPages();
203: revNumber = "";
204: wordCount = -1;
205: encrypted = pdf.isEncrypted();
206: }
207:
208: /**
209: * Reads the summary information from a document.
210: *
211: * @param filename the file to read
212: * @return the summary information
213: */
214: public static FxSummaryInformation getSummaryInformation(
215: String filename) {
216: FileInputStream input = null;
217: try {
218: input = new FileInputStream(filename);
219: FxSummaryInformation result = getSummaryInformation(input);
220: input.close();
221: return result;
222: } catch (Exception ex) {
223: return null;
224: } finally {
225: if (input != null) {
226: try {
227: input.close();
228: } catch (Exception exc) {/**/
229: }
230: }
231: }
232: }
233:
234: /**
235: * Reads the summary information from a document.
236: *
237: * @param input the input stream to read from, will not be closed at the end
238: * @return the summary information
239: */
240: public static FxSummaryInformation getSummaryInformation(
241: InputStream input) {
242: class SummaryStore implements POIFSReaderListener {
243: private FxSummaryInformation fxsi = null;
244:
245: /**
246: * Proccesses the Summary section.
247: *
248: * @param event the summary section event.
249: */
250: public void processPOIFSReaderEvent(POIFSReaderEvent event) {
251: try {
252: SummaryInformation si = (SummaryInformation) PropertySetFactory
253: .create(event.getStream());
254: fxsi = new FxSummaryInformation(si);
255: } catch (Exception ex) {
256: /* ignore */
257: }
258: }
259:
260: protected FxSummaryInformation getFxSummaryInformation() {
261: return fxsi;
262: }
263: }
264: try {
265: POIFSReader reader = new POIFSReader();
266: SummaryStore st = new SummaryStore();
267: reader.registerListener(st, "\005SummaryInformation");
268: reader.read(input);
269: return st.getFxSummaryInformation();
270: } catch (Exception ex) {
271: return null;
272: }
273: }
274:
275: /**
276: * Returns a string that is appended to the text used by the fulltext indexer.
277: *
278: * @return a string that is appended to the text used by the fulltext indexer
279: */
280: public String getFTIndexInformations() {
281: StringBuffer sb = new StringBuffer(1024);
282: if (author != null && author.length() > 0) {
283: sb.append(author).append(" ");
284: }
285: if (applicationName != null && applicationName.length() > 0) {
286: sb.append(author).append(" ");
287: }
288: if (comments != null && comments.length() > 0) {
289: sb.append(author).append(" ");
290: }
291: if (keywords != null && keywords.length() > 0) {
292: sb.append(author).append(" ");
293: }
294: if (title != null && title.length() > 0) {
295: sb.append(author).append(" ");
296: }
297: if (revNumber != null && revNumber.length() > 0) {
298: sb.append(author).append(" ");
299: }
300: return sb.toString();
301: }
302:
303: public String getAuthor() {
304: return author;
305: }
306:
307: public String getApplicationName() {
308: return applicationName;
309: }
310:
311: public long getCharCount() {
312: return charCount;
313: }
314:
315: public String getComments() {
316: return comments;
317: }
318:
319: public Date getCreatedAt() {
320: return (Date) createdAt.clone();
321: }
322:
323: public Date getEditTime() {
324: return (Date) editTime.clone();
325: }
326:
327: public String getKeywords() {
328: return keywords;
329: }
330:
331: public String getLastModifiedBy() {
332: return lastModifiedBy;
333: }
334:
335: public Date getLastPrintedAt() {
336: return (Date) lastPrintedAt.clone();
337: }
338:
339: public String getTitle() {
340: return title;
341: }
342:
343: public Date getLastModifiedAt() {
344: return (Date) lastModifiedAt.clone();
345: }
346:
347: public int getPageCount() {
348: return pageCount;
349: }
350:
351: public String getRevNumber() {
352: return revNumber;
353: }
354:
355: public int getWordCount() {
356: return wordCount;
357: }
358:
359: public boolean isEncrypted() {
360: return encrypted;
361: }
362:
363: public String getAdditionalText() {
364: return additionalText;
365: }
366: }
|