01: /***************************************************************
02: * This file is part of the [fleXive](R) project.
03: *
04: * Copyright (c) 1999-2008
05: * UCS - unique computing solutions gmbh (http://www.ucs.at)
06: * All rights reserved
07: *
08: * The [fleXive](R) project is free software; you can redistribute
09: * it and/or modify it under the terms of the GNU General Public
10: * License as published by the Free Software Foundation;
11: * either version 2 of the License, or (at your option) any
12: * later version.
13: *
14: * The GNU General Public License can be found at
15: * http://www.gnu.org/copyleft/gpl.html.
16: * A copy is found in the textfile GPL.txt and important notices to the
17: * license from the author are found in LICENSE.txt distributed with
18: * these libraries.
19: *
20: * This library is distributed in the hope that it will be useful,
21: * but WITHOUT ANY WARRANTY; without even the implied warranty of
22: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23: * GNU General Public License for more details.
24: *
25: * For further information about UCS - unique computing solutions gmbh,
26: * please see the company website: http://www.ucs.at
27: *
28: * For further information about [fleXive](R), please see the
29: * project website: http://www.flexive.org
30: *
31: *
32: * This copyright notice MUST APPEAR in all copies of the file!
33: ***************************************************************/package com.flexive.extractor;
34:
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
41:
42: public class PdfExtractor {
43:
44: /**
45: * Extracts the text informations from the pdf file.
46: *
47: * @param in the input stream to read from
48: * @return the extraxted informations, or null if no text extraction was possible
49: */
50: public ExtractedData extract(final InputStream in) {
51: ByteArrayOutputStream baos = null;
52: PrintWriter writer = null;
53: PDDocument document = null;
54: try {
55: baos = new ByteArrayOutputStream();
56: writer = new PrintWriter(baos);
57: document = PDDocument.load(in);
58: PDFTextStripper stripper = new PDFTextStripper();
59: stripper.writeText(document, writer);
60: FxSummaryInformation fxsi = new FxSummaryInformation(
61: document);
62: writer.write(fxsi.getFTIndexInformations());
63: writer.flush();
64: return new ExtractedData(fxsi, baos.toString());
65: } catch (Exception exc) {
66: exc.printStackTrace();
67: return null;
68: } finally {
69: try {
70: if (writer != null)
71: writer.close();
72: } catch (Exception exc) {/*ignore*/
73: }
74: try {
75: if (baos != null)
76: baos.close();
77: } catch (Exception exc) {/*ignore*/
78: }
79: try {
80: if (document != null)
81: document.close();
82: } catch (Exception exc) {/*ignore*/
83: }
84: }
85:
86: }
87:
88: }
|