001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017: package org.apache.poi.hssf.extractor;
018:
019: import java.io.IOException;
020:
021: import org.apache.poi.POITextExtractor;
022: import org.apache.poi.hssf.usermodel.HSSFCell;
023: import org.apache.poi.hssf.usermodel.HSSFRichTextString;
024: import org.apache.poi.hssf.usermodel.HSSFRow;
025: import org.apache.poi.hssf.usermodel.HSSFSheet;
026: import org.apache.poi.hssf.usermodel.HSSFWorkbook;
027: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
028:
029: /**
030: * A text extractor for Excel files.
031: * Returns the textual content of the file, suitable for
032: * indexing by something like Lucene, but not really
033: * intended for display to the user.
034: * To turn an excel file into a CSV or similar, then see
035: * the XLS2CSVmra example
036: * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
037: */
038: public class ExcelExtractor extends POITextExtractor {
039: private HSSFWorkbook wb;
040: private boolean includeSheetNames = true;
041: private boolean formulasNotResults = false;
042:
043: public ExcelExtractor(HSSFWorkbook wb) {
044: super (wb);
045: this .wb = wb;
046: }
047:
048: public ExcelExtractor(POIFSFileSystem fs) throws IOException {
049: this (new HSSFWorkbook(fs));
050: }
051:
052: /**
053: * Should sheet names be included? Default is true
054: */
055: public void setIncludeSheetNames(boolean includeSheetNames) {
056: this .includeSheetNames = includeSheetNames;
057: }
058:
059: /**
060: * Should we return the formula itself, and not
061: * the result it produces? Default is false
062: */
063: public void setFormulasNotResults(boolean formulasNotResults) {
064: this .formulasNotResults = formulasNotResults;
065: }
066:
067: /**
068: * Retreives the text contents of the file
069: */
070: public String getText() {
071: StringBuffer text = new StringBuffer();
072:
073: for (int i = 0; i < wb.getNumberOfSheets(); i++) {
074: HSSFSheet sheet = wb.getSheetAt(i);
075: if (sheet == null) {
076: continue;
077: }
078:
079: if (includeSheetNames) {
080: String name = wb.getSheetName(i);
081: if (name != null) {
082: text.append(name);
083: text.append("\n");
084: }
085: }
086:
087: int firstRow = sheet.getFirstRowNum();
088: int lastRow = sheet.getLastRowNum();
089: for (int j = firstRow; j <= lastRow; j++) {
090: HSSFRow row = sheet.getRow(j);
091: if (row == null) {
092: continue;
093: }
094:
095: // Check each cell in turn
096: int firstCell = row.getFirstCellNum();
097: int lastCell = row.getLastCellNum();
098: for (int k = firstCell; k < lastCell; k++) {
099: HSSFCell cell = row.getCell((short) k);
100: boolean outputContents = false;
101: if (cell == null) {
102: continue;
103: }
104:
105: switch (cell.getCellType()) {
106: case HSSFCell.CELL_TYPE_STRING:
107: text.append(cell.getRichStringCellValue()
108: .getString());
109: outputContents = true;
110: break;
111: case HSSFCell.CELL_TYPE_NUMERIC:
112: // Note - we don't apply any formatting!
113: text.append(cell.getNumericCellValue());
114: outputContents = true;
115: break;
116: case HSSFCell.CELL_TYPE_BOOLEAN:
117: text.append(cell.getBooleanCellValue());
118: outputContents = true;
119: break;
120: case HSSFCell.CELL_TYPE_FORMULA:
121: if (formulasNotResults) {
122: text.append(cell.getCellFormula());
123: } else {
124: // Try it as a string, if not as a number
125: HSSFRichTextString str = cell
126: .getRichStringCellValue();
127: if (str != null && str.length() > 0) {
128: text.append(str.toString());
129: } else {
130: // Try and treat it as a number
131: double val = cell.getNumericCellValue();
132: text.append(val);
133: }
134: }
135: outputContents = true;
136: break;
137: }
138:
139: // Output a tab if we're not on the last cell
140: if (outputContents && k < (lastCell - 1)) {
141: text.append("\t");
142: }
143: }
144:
145: // Finish off the row
146: text.append("\n");
147: }
148: }
149:
150: return text.toString();
151: }
152: }
|