001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlExtractor.java,v $
003: * Date : $Date: 2008-02-27 12:05:36 $
004: * Version: $Revision: 1.13 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.util;
033:
034: import org.opencms.staticexport.CmsLinkProcessor;
035:
036: import java.io.ByteArrayInputStream;
037: import java.io.InputStream;
038: import java.io.UnsupportedEncodingException;
039:
040: import org.htmlparser.Parser;
041: import org.htmlparser.beans.StringBean;
042: import org.htmlparser.lexer.Lexer;
043: import org.htmlparser.lexer.Page;
044: import org.htmlparser.util.ParserException;
045:
046: /**
047: * Extracts plain text from HTML.<p>
048: *
049: * @author Alexander Kandzior
050: *
051: * @version $Revision: 1.13 $
052: *
053: * @since 6.0.0
054: */
055: public final class CmsHtmlExtractor {
056:
057: /**
058: * Hides the public constructor.<p>
059: */
060: private CmsHtmlExtractor() {
061:
062: // hides the public constructor
063: }
064:
065: /**
066: * Extract the text from a HTML page.<p>
067: *
068: * @param in the html content input stream
069: * @param encoding the encoding of the content
070: *
071: * @return the extracted text from the page
072: * @throws ParserException if the parsing of the HTML failed
073: * @throws UnsupportedEncodingException if the given encoding is not supported
074: */
075: public static String extractText(InputStream in, String encoding)
076: throws ParserException, UnsupportedEncodingException {
077:
078: Parser parser = new Parser();
079: Lexer lexer = new Lexer();
080: Page page = new Page(in, encoding);
081: lexer.setPage(page);
082: parser.setLexer(lexer);
083:
084: StringBean stringBean = new StringBean();
085: parser.visitAllNodesWith(stringBean);
086:
087: String result = stringBean.getStrings();
088: return result == null ? "" : result;
089: }
090:
091: /**
092: * Extract the text from a HTML page.<p>
093: *
094: * @param content the html content
095: * @param encoding the encoding of the content
096: *
097: * @return the extracted text from the page
098: * @throws ParserException if the parsing of the HTML failed
099: * @throws UnsupportedEncodingException if the given encoding is not supported
100: */
101: public static String extractText(String content, String encoding)
102: throws ParserException, UnsupportedEncodingException {
103:
104: if (CmsStringUtil.isEmpty(content)) {
105: // if there is no HTML, then we don't need to extract anything
106: return content;
107: }
108:
109: // we must make sure that the content passed to the parser always is
110: // a "valid" HTML page, i.e. is surrounded by <html><body>...</body></html>
111: // otherwise you will get strange results for some specific HTML constructs
112: StringBuffer newContent = new StringBuffer(
113: content.length() + 32);
114:
115: newContent.append(CmsLinkProcessor.HTML_START);
116: newContent.append(content);
117: newContent.append(CmsLinkProcessor.HTML_END);
118:
119: // make sure the Lexer uses the right encoding
120: InputStream in = new ByteArrayInputStream(newContent.toString()
121: .getBytes(encoding));
122:
123: // use the stream based version to process the results
124: return extractText(in, encoding);
125: }
126: }
|