001: /***************************************************************
002: * This file is part of the [fleXive](R) project.
003: *
004: * Copyright (c) 1999-2008
005: * UCS - unique computing solutions gmbh (http://www.ucs.at)
006: * All rights reserved
007: *
008: * The [fleXive](R) project is free software; you can redistribute
009: * it and/or modify it under the terms of the GNU General Public
010: * License as published by the Free Software Foundation;
011: * either version 2 of the License, or (at your option) any
012: * later version.
013: *
014: * The GNU General Public License can be found at
015: * http://www.gnu.org/copyleft/gpl.html.
016: * A copy is found in the textfile GPL.txt and important notices to the
017: * license from the author are found in LICENSE.txt distributed with
018: * these libraries.
019: *
020: * This library is distributed in the hope that it will be useful,
021: * but WITHOUT ANY WARRANTY; without even the implied warranty of
022: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: * GNU General Public License for more details.
024: *
025: * For further information about UCS - unique computing solutions gmbh,
026: * please see the company website: http://www.ucs.at
027: *
028: * For further information about [fleXive](R), please see the
029: * project website: http://www.flexive.org
030: *
031: *
032: * This copyright notice MUST APPEAR in all copies of the file!
033: ***************************************************************/package com.flexive.extractor;
034:
import org.apache.poi.hdf.extractor.WordDocument;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
046:
047: class WordExtractor implements POIFSReaderListener {
048:
049: private FxSummaryInformation fxsi = null;
050:
051: /**
052: * Proccesses the Summary section.
053: *
054: * @param event the summary section event.
055: */
056: public void processPOIFSReaderEvent(POIFSReaderEvent event) {
057: try {
058: SummaryInformation si = (SummaryInformation) PropertySetFactory
059: .create(event.getStream());
060: fxsi = new FxSummaryInformation(si);
061: } catch (Exception ex) {
062: //
063: }
064: }
065:
066: /**
067: * Extracts the text informations from the word file.
068: *
069: * @param in the input stream to read from
070: * @return the extraxted informations, or null if no text extraction was possible
071: */
072: public ExtractedData extract(final InputStream in) {
073: ByteArrayOutputStream baos = null;
074: PrintWriter writer = null;
075: BufferedInputStream bis = null;
076: try {
077:
078: baos = new ByteArrayOutputStream();
079: writer = new PrintWriter(baos);
080:
081: // We need to read the stream 2 times, so we use a buffered input stream and mark the
082: // beginning
083: bis = new BufferedInputStream(in);
084: bis.mark(Integer.MAX_VALUE);
085:
086: // Retrieve summary information
087: POIFSReader r = new POIFSReader();
088: r.registerListener(this , "\005SummaryInformation");
089: r.read(bis);
090: bis.reset();
091:
092: // Retrieve text
093: WordDocument wd = new WordDocument(bis);
094: wd.writeAllText(writer);
095: if (fxsi != null) {
096: writer.write(fxsi.getFTIndexInformations());
097: }
098: writer.flush();
099:
100: return new ExtractedData(fxsi, baos.toString());
101: } catch (Exception exc) {
102: return null;
103: } finally {
104: try {
105: if (writer != null)
106: writer.close();
107: } catch (Exception exc) {/*ignore*/
108: }
109: try {
110: if (baos != null)
111: baos.close();
112: } catch (Exception exc) {/*ignore*/
113: }
114: try {
115: if (bis != null)
116: bis.close();
117: } catch (Exception exc) {/*ignore*/
118: }
119: }
120: }
121:
122: }
|