001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/I_CmsTextExtractor.java,v $
003: * Date : $Date: 2008-02-27 12:05:31 $
004: * Version: $Revision: 1.7 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.search.extractors;
033:
034: import java.io.InputStream;
035:
036: /**
037: * Allows extraction of the indexable "plain" text plus (optional) meta information from a given binary
038: * input document format.<p>
039: *
040: * @author Alexander Kandzior
041: *
042: * @version $Revision: 1.7 $
043: *
044: * @since 6.0.0
045: */
046: public interface I_CmsTextExtractor {
047:
048: /**
049: * Extracts the text and meta information from the given binary document.<p>
050: *
051: * The encoding of the input stream is either not required (the document type may have
052: * one common default encoding) or the extractor is able to divine the encoding
053: * from the provided binary array automatically.<p>
054: *
055: * Delivers is the same result as calling <code>{@link #extractText(byte[], String)}</code>
056: * when <code>String == null</code>.<p>
057: *
058: * @param content the binary content of the document to extract the text from
059: * @return the extracted text
060: *
061: * @throws Exception if the text extration fails
062: */
063: I_CmsExtractionResult extractText(byte[] content) throws Exception;
064:
065: /**
066: * Extracts the text and meta information from the given binary document, using the specified content encoding.<p>
067: *
068: * The encoding is a hint for the text extractor, if the value given is <code>null</code> then
069: * the text extractor should try to figure out the encoding itself.<p>
070: *
071: * @param content the binary content of the document to extract the text from
072: * @param encoding the encoding to use
073: *
074: * @return the extracted text
075: *
076: * @throws Exception if the text extration fails
077: */
078: I_CmsExtractionResult extractText(byte[] content, String encoding)
079: throws Exception;
080:
081: /**
082: * Extracts the text and meta information from the document on the input stream.<p>
083: *
084: * The encoding of the input stream is either not required (the document type may have
085: * one common default encoding) or the extractor is able to divine the encoding
086: * from the provided input stream automatically.<p>
087: *
088: * Delivers is the same result as calling <code>{@link #extractText(InputStream, String)}</code>
089: * when <code>String == null</code>.<p>
090: *
091: * @param in the input stream for the document to extract the text from
092: * @return the extracted text and meta information
093: *
094: * @throws Exception if the text extration fails
095: */
096: I_CmsExtractionResult extractText(InputStream in) throws Exception;
097:
098: /**
099: * Extracts the text and meta information from the document on the input stream, using the specified content encoding.<p>
100: *
101: * The encoding is a hint for the text extractor, if the value given is <code>null</code> then
102: * the text extractor should try to figure out the encoding itself.<p>
103: *
104: * @param in the input stream for the document to extract the text from
105: * @param encoding the encoding to use
106: *
107: * @return the extracted text and meta information
108: *
109: * @throws Exception if the text extration fails
110: */
111: I_CmsExtractionResult extractText(InputStream in, String encoding)
112: throws Exception;
113: }
|