01: /*
02: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/CmsExtractorMsWord.java,v $
03: * Date : $Date: 2008-02-27 12:05:31 $
04: * Version: $Revision: 1.12 $
05: *
06: * This library is part of OpenCms -
07: * the Open Source Content Management System
08: *
09: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
10: *
11: * This library is free software; you can redistribute it and/or
12: * modify it under the terms of the GNU Lesser General Public
13: * License as published by the Free Software Foundation; either
14: * version 2.1 of the License, or (at your option) any later version.
15: *
16: * This library is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19: * Lesser General Public License for more details.
20: *
21: * For further information about Alkacon Software GmbH, please see the
22: * company website: http://www.alkacon.com
23: *
24: * For further information about OpenCms, please see the
25: * project website: http://www.opencms.org
26: *
27: * You should have received a copy of the GNU Lesser General Public
28: * License along with this library; if not, write to the Free Software
29: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30: */
31:
32: package org.opencms.search.extractors;
33:
34: import java.io.InputStream;
35:
36: import org.apache.poi.poifs.eventfilesystem.POIFSReader;
37:
38: import org.textmining.text.extraction.WordExtractor;
39:
40: /**
41: * Extracts the text from an MS Word document.<p>
42: *
43: * @author Alexander Kandzior
44: *
45: * @version $Revision: 1.12 $
46: *
47: * @since 6.0.0
48: */
49: public final class CmsExtractorMsWord extends
50: A_CmsTextExtractorMsOfficeBase {
51:
52: /** Static member instance of the extractor. */
53: private static final CmsExtractorMsWord INSTANCE = new CmsExtractorMsWord();
54:
55: /**
56: * Hide the public constructor.<p>
57: */
58: private CmsExtractorMsWord() {
59:
60: // noop
61: }
62:
63: /**
64: * Returns an instance of this text extractor.<p>
65: *
66: * @return an instance of this text extractor
67: */
68: public static I_CmsTextExtractor getExtractor() {
69:
70: return INSTANCE;
71: }
72:
73: /**
74: * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
75: */
76: public I_CmsExtractionResult extractText(InputStream in,
77: String encoding) throws Exception {
78:
79: // first extract the text using the text abstraction libary
80: WordExtractor wordExtractor = new WordExtractor();
81: String rawContent = wordExtractor
82: .extractText(getStreamCopy(in));
83: rawContent = removeControlChars(rawContent);
84:
85: // now extract the meta information using POI
86: POIFSReader reader = new POIFSReader();
87: reader.registerListener(this );
88: reader.read(getStreamCopy(in));
89:
90: // combine the meta information with the content and create the result
91: return createExtractionResult(rawContent);
92: }
93: }
|