001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.extractors;
023:
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
037:
038: /**
039: * This class extracts text from MS Excel files by using the POI library.
040: *
041: * @author Michael Franken
042: * @version $Revision: 1.6 $
043: */
044: public class ExcelExtractor extends AbstractExtractor {
045: /**
046: * Extract the content from the given Excel file. As a side effect the type is set too.
047: *
048: * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
049: */
050: public final Reader getContent(final File f) {
051: Reader reader = null;
052:
053: setType("EXCEL");
054:
055: try {
056: CharArrayWriter writer = new CharArrayWriter();
057:
058: POIFSFileSystem fs = new POIFSFileSystem(
059: new FileInputStream(f));
060: HSSFWorkbook workbook = new HSSFWorkbook(fs);
061:
062: for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
063: HSSFSheet sheet = workbook.getSheetAt(i);
064:
065: Iterator rows = sheet.rowIterator();
066: while (rows.hasNext()) {
067: HSSFRow row = (HSSFRow) rows.next();
068:
069: Iterator cells = row.cellIterator();
070: while (cells.hasNext()) {
071: HSSFCell cell = (HSSFCell) cells.next();
072: switch (cell.getCellType()) {
073: case HSSFCell.CELL_TYPE_NUMERIC:
074: String num = Double.toString(
075: cell.getNumericCellValue()).trim();
076: if (num.length() > 0) {
077: writer.write(num + " ");
078: }
079: break;
080: case HSSFCell.CELL_TYPE_STRING:
081: String text = cell.getStringCellValue()
082: .trim();
083: if (text.length() > 0) {
084: writer.write(text + " ");
085: }
086: break;
087: default: // skip
088: }
089: }
090: }
091: }
092: setSummary(getSummaryFromContent(writer.toString()));
093:
094: return new CharArrayReader(writer.toCharArray());
095: } catch (Exception e) {
096: log.warn("Can't extract contents for: " + f.getName(), e);
097: }
098:
099: return reader;
100: }
101:
102: /**
103: * Extract the content from the given Excel file. As a side effect the type is set too.
104: *
105: * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
106: */
107: public final String getContent(final InputStream is) {
108: try {
109: CharArrayWriter writer = new CharArrayWriter();
110:
111: POIFSFileSystem fs = new POIFSFileSystem(is);
112: HSSFWorkbook workbook = new HSSFWorkbook(fs);
113:
114: for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
115: HSSFSheet sheet = workbook.getSheetAt(i);
116:
117: Iterator rows = sheet.rowIterator();
118: while (rows.hasNext()) {
119: HSSFRow row = (HSSFRow) rows.next();
120:
121: Iterator cells = row.cellIterator();
122: while (cells.hasNext()) {
123: HSSFCell cell = (HSSFCell) cells.next();
124: switch (cell.getCellType()) {
125: case HSSFCell.CELL_TYPE_NUMERIC:
126: String num = Double.toString(
127: cell.getNumericCellValue()).trim();
128: if (num.length() > 0) {
129: writer.write(num + " ");
130: }
131: break;
132: case HSSFCell.CELL_TYPE_STRING:
133: String text = cell.getStringCellValue()
134: .trim();
135: if (text.length() > 0) {
136: writer.write(text + " ");
137: }
138: break;
139: default: // skip
140: }
141: }
142: }
143: }
144:
145: return new String(writer.toCharArray());
146: } catch (Exception e) {
147: log.warn("Can't extract contents", e);
148: }
149:
150: return "";
151: }
152: }
|