001: /*
002:
003: * LIUS - Lucene Index Update and Search
004: * http://sourceforge.net/projects/lius/
005: *
006: * Copyright (c) 2005, Laval University Library. All rights reserved.
007: *
008: * This library is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Lesser General Public
010: * License as published by the Free Software Foundation; either
011: * version 2.1 of the License, or (at your option) any later version.
012: *
013: * This library is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public
019: * License along with this library; if not, write to the Free Software
020: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
021: */
022:
023: package ca.ulaval.bibl.lius.index.PDF;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.IOException;
028: import java.io.StringWriter;
029: import java.util.ArrayList;
030: import java.util.Collection;
031: import java.util.Iterator;
032:
033: import org.apache.log4j.Logger;
034: import org.apache.lucene.document.Document;
035: import org.pdfbox.exceptions.CryptographyException;
036: import org.pdfbox.exceptions.InvalidPasswordException;
037: import org.pdfbox.pdmodel.PDDocument;
038: import org.pdfbox.pdmodel.PDDocumentInformation;
039: import org.pdfbox.util.PDFTextStripper;
040:
041: import ca.ulaval.bibl.lius.Lucene.LuceneActions;
042: import ca.ulaval.bibl.lius.config.LiusConfig;
043: import ca.ulaval.bibl.lius.config.LiusConfigBuilder;
044: import ca.ulaval.bibl.lius.config.LiusField;
045: import ca.ulaval.bibl.lius.index.Indexer;
046:
047: /**
048: *
049: * Classe permettant d'indexer des fichiers PDF basée sur PDFBox et inspirée de
050: * la classe LucenePDFDocument
051: *
052: * <br/><br/>
053: *
054: * Class for indexing PDF documents, based on PDFBox and inspired from
055: * LucenePDFDocument.
056: *
057: * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
058: *
059: */
060:
061: public class PdfIndexer
062:
063: extends Indexer {
064:
065: static Logger logger = Logger.getRootLogger();
066:
067: private PDDocument pdfDocument = null;
068:
069: public Object parse(Object file) {
070:
071: String contents = "";
072:
073: try {
074:
075: FileInputStream input = new FileInputStream(new File(
076: (String) file));
077:
078: pdfDocument = PDDocument.load(input);
079:
080: if (pdfDocument.isEncrypted()) {
081:
082: pdfDocument.decrypt("");
083:
084: }
085:
086: StringWriter writer = new StringWriter();
087:
088: PDFTextStripper stripper = new PDFTextStripper();
089:
090: stripper.writeText(pdfDocument, writer);
091:
092: contents = writer.getBuffer().toString();
093:
094: }
095:
096: catch (CryptographyException e) {
097:
098: logger.error(e.getMessage());
099:
100: }
101:
102: catch (IOException e) {
103:
104: logger.error(e.getMessage());
105:
106: }
107:
108: catch (InvalidPasswordException e) {
109:
110: logger.error(e.getMessage());
111:
112: }
113:
114: finally {
115:
116: if (pdfDocument != null) {
117:
118: try {
119:
120: pdfDocument.close();
121:
122: }
123:
124: catch (IOException ex) {
125:
126: logger.error(ex.getMessage());
127:
128: }
129:
130: }
131:
132: }
133:
134: return contents;
135:
136: }
137:
138: /**
139: *
140: * Méthode retournant un objet de type Lucene document à partir du fichier
141: *
142: * à indexer et du fichier de configuration de Lius exprimé sous forme
143: *
144: * d'objet de type LiusConfig.
145: *
146: * <br/><br/>
147: *
148: * Method that returns a Lucene Document object from a file to index and
149: *
150: * the Lius Configuration as a LiusConfig object.
151: *
152: */
153:
154: public Document createLuceneDocument(String file, LiusConfig lc) {
155:
156: Document doc = createLuceneDocument(file, lc.getPdfFields());
157:
158: return doc;
159:
160: }
161:
162: /**
163: *
164: * Permet de récupérer les champs de Lius à partir du fichier de
165: * configuration
166: *
167: * pour effectuer l'indexation.
168: *
169: * <br/><br/>
170: *
171: * Gets Lius fields from the configuration file for indexation.
172: *
173: */
174:
175: public Collection getLiusFields(LiusConfig lc) {
176:
177: return lc.getPdfFields();
178:
179: }
180:
181: /**
182: *
183: * Méthode retournant un objet de type Lucene document à partir du fichier à
184: *
185: * indexer et d'une collection d'objets de type LiusField. Chaque objet
186: *
187: * LiusField contient de l'information sur le nom du champs Lucene, le type,
188: *
189: * etc.
190: *
191: * <br/><br/>
192: *
193: * Method that returns a Lucene object from the configuration file and a
194: * collection
195: *
196: * of LiusField objects. Each LiusField object contains information about
197: * the Lucene
198: *
199: * field, the type, etc.
200: *
201: */
202:
203: public Collection getPopulatedCollection(Object file,
204: Collection liusFields) {
205:
206: Collection coll = new ArrayList();
207:
208: String contents = (String) parse(file);
209:
210: LuceneActions la = LuceneActions.getSingletonInstance();
211:
212: Iterator i = liusFields.iterator();
213:
214: while (i.hasNext()) {
215:
216: Object field = i.next();
217:
218: if (field instanceof LiusField) {
219:
220: LiusField lf = (LiusField) field;
221:
222: if (lf.getGet() != null) {
223:
224: if (lf.getGet().equalsIgnoreCase("content")) {
225:
226: lf.setValue(contents);
227:
228: coll.add(lf);
229:
230: }
231:
232: else {
233:
234: PDDocumentInformation metaData = pdfDocument.
235:
236: getDocumentInformation();
237:
238: if (lf.getGet().equalsIgnoreCase("title")) {
239:
240: if (metaData.getTitle() != null) {
241:
242: lf.setValue(metaData.getTitle());
243:
244: coll.add(lf);
245:
246: }
247:
248: }
249:
250: else if (lf.getGet().equalsIgnoreCase("author")) {
251:
252: if (metaData.getAuthor() != null) {
253:
254: lf.setValue(metaData.getAuthor());
255:
256: coll.add(lf);
257:
258: }
259:
260: }
261:
262: else if (lf.getGet()
263: .equalsIgnoreCase("creator")) {
264:
265: if (metaData.getCreator() != null) {
266:
267: lf.setValue(metaData.getCreator());
268:
269: coll.add(lf);
270:
271: }
272:
273: }
274:
275: else if (lf.getGet().equalsIgnoreCase(
276: "keywords")) {
277:
278: if (metaData.getKeywords() != null) {
279:
280: lf.setValue(metaData.getKeywords());
281:
282: coll.add(lf);
283:
284: }
285:
286: }
287:
288: else if (lf.getGet().equalsIgnoreCase(
289: "producer")) {
290:
291: if (metaData.getProducer() != null) {
292:
293: lf.setValue(metaData.getProducer());
294:
295: coll.add(lf);
296:
297: }
298:
299: }
300:
301: else if (lf.getGet()
302: .equalsIgnoreCase("subject")) {
303:
304: if (metaData.getSubject() != null) {
305:
306: lf.setValue(metaData.getSubject());
307:
308: coll.add(lf);
309:
310: }
311:
312: }
313:
314: else if (lf.getGet()
315: .equalsIgnoreCase("trapped")) {
316:
317: if (metaData.getTrapped() != null) {
318:
319: lf.setValue(metaData.getTrapped());
320:
321: coll.add(lf);
322:
323: }
324:
325: }
326:
327: else if (lf.getGet().equalsIgnoreCase(
328: "creationDate")) {
329:
330: if (metaData.getCreationDate() != null) {
331:
332: lf.setDate(metaData.getCreationDate()
333: .getTime());
334:
335: coll.add(lf);
336:
337: }
338:
339: }
340:
341: else if (lf.getGet().equalsIgnoreCase(
342: "modificationDate")) {
343:
344: if (metaData.getModificationDate() != null) {
345:
346: lf.setDate(metaData
347: .getModificationDate()
348: .getTime());
349:
350: coll.add(lf);
351:
352: }
353:
354: }
355:
356: else if (lf.getGet()
357: .equalsIgnoreCase("summary")) {
358:
359: int summarySize = Math.min(contents
360: .length(), 500);
361:
362: String summary = contents.substring(0,
363: summarySize);
364:
365: lf.setValue(summary);
366:
367: coll.add(lf);
368:
369: }
370:
371: }
372:
373: }
374:
375: }
376:
377: else {
378:
379: coll.add(field);
380:
381: }
382:
383: }
384:
385: return coll;
386:
387: }
388:
389: public Collection getPopulatedCollection(Object file,
390: String liusConfig) {
391:
392: LiusConfig lc = LiusConfigBuilder.getSingletonInstance()
393: .getLiusConfig(
394:
395: liusConfig);
396:
397: return getPopulatedCollection(file, lc);
398:
399: }
400:
401: public Collection getPopulatedCollection(Object file, LiusConfig lc) {
402:
403: return getPopulatedCollection(file, lc.getPdfFields());
404:
405: }
406:
407: }
|