001: /***************************************************************
002: * This file is part of the [fleXive](R) project.
003: *
004: * Copyright (c) 1999-2008
005: * UCS - unique computing solutions gmbh (http://www.ucs.at)
006: * All rights reserved
007: *
008: * The [fleXive](R) project is free software; you can redistribute
009: * it and/or modify it under the terms of the GNU General Public
010: * License as published by the Free Software Foundation;
011: * either version 2 of the License, or (at your option) any
012: * later version.
013: *
014: * The GNU General Public License can be found at
015: * http://www.gnu.org/copyleft/gpl.html.
016: * A copy is found in the textfile GPL.txt and important notices to the
017: * license from the author are found in LICENSE.txt distributed with
018: * these libraries.
019: *
020: * This library is distributed in the hope that it will be useful,
021: * but WITHOUT ANY WARRANTY; without even the implied warranty of
022: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: * GNU General Public License for more details.
024: *
025: * For further information about UCS - unique computing solutions gmbh,
026: * please see the company website: http://www.ucs.at
027: *
028: * For further information about [fleXive](R), please see the
029: * project website: http://www.flexive.org
030: *
031: *
032: * This copyright notice MUST APPEAR in all copies of the file!
033: ***************************************************************/package com.flexive.extractor;
034:
035: import java.io.File;
036: import java.io.FileInputStream;
037: import java.io.InputStream;
038:
039: public class Extractor {
040:
041: public static enum DocumentType {
042: Word, Excel, PDF, Powerpoint, HTML
043: }
044:
045: /**
046: * Extracts data from a given file.
047: *
048: * @param filename the filename
049: * @param type the type of the document
050: * @return the extracted data
051: */
052: public static ExtractedData extractData(final String filename,
053: final DocumentType type) {
054: FileInputStream input = null;
055: try {
056: input = new FileInputStream(filename);
057: return extractData(input, type);
058: } catch (Exception ex) {
059: return null;
060: } finally {
061: try {
062: if (input != null)
063: input.close();
064: } catch (Exception exc) {/*ignore*/
065: }
066: }
067: }
068:
069: /**
070: * Extracts data from a given file.
071: *
072: * @param file the file
073: * @param type the type of the document
074: * @return the extracted data
075: */
076: public static ExtractedData extractData(final File file,
077: final DocumentType type) {
078: FileInputStream input = null;
079: try {
080: input = new FileInputStream(file);
081: return extractData(input, type);
082: } catch (Exception ex) {
083: return null;
084: } finally {
085: try {
086: if (input != null)
087: input.close();
088: } catch (Exception exc) {/*ignore*/
089: }
090: }
091: }
092:
093: /**
094: * Extracts data from a given input stream.
095: *
096: * @param in the input stream to read from, it is not closed at the end
097: * @param type the type of the document
098: * @return the extracted data
099: */
100: public static ExtractedData extractData(final InputStream in,
101: final DocumentType type) {
102: switch (type) {
103: case Word:
104: return new WordExtractor().extract(in);
105: case Powerpoint:
106: return new PowerpointExtractor().extract(in);
107: case Excel:
108: return new ExcelExtractor().extract(in);
109: case PDF:
110: return new PdfExtractor().extract(in);
111: case HTML:
112: return HtmlExtractor.extract(in);
113: default:
114: return null;
115: }
116: }
117: }
|