001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/CmsExtractorMsExcel.java,v $
003: * Date : $Date: 2008-02-27 12:05:30 $
004: * Version: $Revision: 1.12 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.search.extractors;
033:
034: import org.opencms.util.CmsStringUtil;
035:
036: import java.io.IOException;
037: import java.io.InputStream;
038: import java.util.Iterator;
039:
040: import org.apache.poi.hssf.usermodel.HSSFCell;
041: import org.apache.poi.hssf.usermodel.HSSFRow;
042: import org.apache.poi.hssf.usermodel.HSSFSheet;
043: import org.apache.poi.hssf.usermodel.HSSFWorkbook;
044: import org.apache.poi.poifs.eventfilesystem.POIFSReader;
045:
046: /**
047: * Extracts the text from an MS Excel document.<p>
048: *
049: * @author Alexander Kandzior
050: *
051: * @version $Revision: 1.12 $
052: *
053: * @since 6.0.0
054: */
055: public final class CmsExtractorMsExcel extends
056: A_CmsTextExtractorMsOfficeBase {
057:
058: /** Static member instance of the extractor. */
059: private static final CmsExtractorMsExcel INSTANCE = new CmsExtractorMsExcel();
060:
061: /**
062: * Hide the public constructor.<p>
063: */
064: private CmsExtractorMsExcel() {
065:
066: // noop
067: }
068:
069: /**
070: * Returns an instance of this text extractor.<p>
071: *
072: * @return an instance of this text extractor
073: */
074: public static I_CmsTextExtractor getExtractor() {
075:
076: return INSTANCE;
077: }
078:
079: /**
080: * @see org.opencms.search.extractors.I_CmsTextExtractor#extractText(java.io.InputStream, java.lang.String)
081: */
082: public I_CmsExtractionResult extractText(InputStream in,
083: String encoding) throws Exception {
084:
085: // first extract the table content
086: String rawContent = extractTableContent(getStreamCopy(in));
087: rawContent = removeControlChars(rawContent);
088:
089: // now extract the meta information using POI
090: POIFSReader reader = new POIFSReader();
091: reader.registerListener(this );
092: reader.read(getStreamCopy(in));
093:
094: // combine the meta information with the content and create the result
095: return createExtractionResult(rawContent);
096: }
097:
098: /**
099: * Extracts the text from the Excel table content.<p>
100: *
101: * @param in the document input stream
102: * @return the extracted text
103: * @throws IOException if something goes wring
104: */
105: protected String extractTableContent(InputStream in)
106: throws IOException {
107:
108: HSSFWorkbook excelWb = new HSSFWorkbook(in);
109: StringBuffer result = new StringBuffer(4096);
110:
111: int numberOfSheets = excelWb.getNumberOfSheets();
112:
113: for (int i = 0; i < numberOfSheets; i++) {
114: HSSFSheet sheet = excelWb.getSheetAt(i);
115: int numberOfRows = sheet.getPhysicalNumberOfRows();
116: if (numberOfRows > 0) {
117:
118: if (CmsStringUtil.isNotEmpty(excelWb.getSheetName(i))) {
119: // append sheet name to content
120: if (i > 0) {
121: result.append("\n\n");
122: }
123: result.append(excelWb.getSheetName(i).trim());
124: result.append(":\n\n");
125: }
126:
127: Iterator rowIt = sheet.rowIterator();
128: while (rowIt.hasNext()) {
129: HSSFRow row = (HSSFRow) rowIt.next();
130: if (row != null) {
131: boolean hasContent = false;
132: Iterator it = row.cellIterator();
133: while (it.hasNext()) {
134: HSSFCell cell = (HSSFCell) it.next();
135: String text = null;
136: try {
137: switch (cell.getCellType()) {
138: case HSSFCell.CELL_TYPE_BLANK:
139: case HSSFCell.CELL_TYPE_ERROR:
140: // ignore all blank or error cells
141: break;
142: case HSSFCell.CELL_TYPE_NUMERIC:
143: text = Double.toString(cell
144: .getNumericCellValue());
145: break;
146: case HSSFCell.CELL_TYPE_BOOLEAN:
147: text = Boolean.toString(cell
148: .getBooleanCellValue());
149: break;
150: case HSSFCell.CELL_TYPE_STRING:
151: default:
152: text = cell.getStringCellValue();
153: break;
154: }
155: } catch (Exception e) {
156: // ignore this cell
157: }
158: if ((text != null) && (text.length() != 0)) {
159: result.append(text.trim());
160: result.append(' ');
161: hasContent = true;
162: }
163: }
164: if (hasContent) {
165: // append a newline at the end of each row that has content
166: result.append('\n');
167: }
168: }
169: }
170: }
171: }
172:
173: return result.toString();
174: }
175: }
|