001: /*
002: * File : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlParser.java,v $
003: * Date : $Date: 2008-02-27 12:05:36 $
004: * Version: $Revision: 1.8 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.util;
033:
034: import org.opencms.jsp.parse.DivTag;
035:
036: import java.util.ArrayList;
037: import java.util.Arrays;
038: import java.util.Iterator;
039: import java.util.List;
040:
041: import org.htmlparser.Parser;
042: import org.htmlparser.PrototypicalNodeFactory;
043: import org.htmlparser.Remark;
044: import org.htmlparser.Tag;
045: import org.htmlparser.Text;
046: import org.htmlparser.lexer.Lexer;
047: import org.htmlparser.lexer.Page;
048: import org.htmlparser.tags.Div;
049: import org.htmlparser.util.ParserException;
050: import org.htmlparser.visitors.NodeVisitor;
051:
052: /**
053: * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code>
054: * implementations, which provides some often used utility functions.
055: * <p>
056: *
057: * This base implementation is only a "pass through" class, that is the content is parsed, but the
058: * generated result is exactly identical to the input.
059: * <p>
060: *
061: * @author Alexander Kandzior
062: *
063: * @version $Revision: 1.8 $
064: *
065: * @since 6.2.0
066: */
067: public class CmsHtmlParser extends NodeVisitor implements
068: I_CmsHtmlNodeVisitor {
069:
070: /** List of upper case tag name strings of tags that should not be auto-corrected if closing divs are missing. */
071: protected List m_noAutoCloseTags;
072:
073: /** The array of supported tag names. */
074: // important: don't change the order of these tags in the source, subclasses may expect the tags
075: // at the exact indices give here
076: // if you want to add tags, add them at the end
077: protected static final String[] TAG_ARRAY = new String[] { "H1",
078: "H2", "H3", "H4", "H5", "H6", "P", "DIV", "SPAN", "BR",
079: "OL", "UL", "LI", "TABLE", "TD", "TR", "TH", "THEAD",
080: "TBODY", "TFOOT" };
081:
082: /** The list of supported tag names. */
083: protected static final List TAG_LIST = Arrays.asList(TAG_ARRAY);
084:
085: /** Indicates if "echo" mode is on, that is all content is written to the result by default. */
086: protected boolean m_echo;
087:
088: /** The buffer to write the out to. */
089: protected StringBuffer m_result;
090:
091: /** The providable configuration - never null by contract of interface. */
092: private String m_configuration = "";
093:
094: /**
095: * Creates a new instance of the html converter with echo mode set to <code>false</code>.
096: * <p>
097: */
098: public CmsHtmlParser() {
099:
100: this (false);
101: }
102:
103: /**
104: * Creates a new instance of the html converter.
105: * <p>
106: *
107: * @param echo indicates if "echo" mode is on, that is all content is written to the result
108: */
109: public CmsHtmlParser(boolean echo) {
110:
111: m_result = new StringBuffer(1024);
112: m_echo = echo;
113: m_noAutoCloseTags = new ArrayList(32);
114: }
115:
116: /**
117: * Internally degrades Composite tags that do have children in the DOM tree
118: * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
119: *
120: * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code>
121: */
122: protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {
123:
124: PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
125:
126: String tagName;
127: Iterator it = m_noAutoCloseTags.iterator();
128: Div div = new Div();
129: List divNames = Arrays.asList(div.getIds());
130: while (it.hasNext()) {
131: tagName = ((String) it.next());
132: // div
133: if (divNames.contains(tagName)) {
134: factory.unregisterTag(new Div());
135: factory.registerTag(new DivTag());
136: }
137: // TODO: add more tags for flat parsing / non correction of missing closing tags here
138: }
139: return factory;
140: }
141:
142: /**
143: * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration()
144: */
145: public String getConfiguration() {
146:
147: return m_configuration;
148: }
149:
150: /**
151: * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult()
152: */
153: public String getResult() {
154:
155: return m_result.toString();
156: }
157:
158: /**
159: * Returns the HTML for the given tag itself (not the tag content).
160: * <p>
161: *
162: * @param tag the tag to create the HTML for
163: *
164: * @return the HTML for the given tag
165: */
166: public String getTagHtml(Tag tag) {
167:
168: StringBuffer result = new StringBuffer(32);
169: result.append('<');
170: result.append(tag.getText());
171: result.append('>');
172: return result.toString();
173: }
174:
175: /**
176: * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
177: */
178: public String process(String html, String encoding)
179: throws ParserException {
180:
181: m_result = new StringBuffer();
182: Parser parser = new Parser();
183: Lexer lexer = new Lexer();
184:
185: // initialize the page with the given char set
186: Page page = new Page(html, encoding);
187: lexer.setPage(page);
188: parser.setLexer(lexer);
189:
190: if (m_noAutoCloseTags != null && m_noAutoCloseTags.size() > 0) {
191: // Degrade Composite tags that do have children in the DOM tree
192: // to simple single tags: This allows to finish this tag with opened HTML tags without the effect
193: // that html parser will generate the closing tags.
194: PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
195: lexer.setNodeFactory(factory);
196: }
197:
198: // process the page using the given visitor
199: parser.visitAllNodesWith(this );
200: // return the result
201: return getResult();
202: }
203:
204: /**
205: *
206: * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String)
207: */
208: public void setConfiguration(String configuration) {
209:
210: if (CmsStringUtil.isNotEmpty(configuration)) {
211: m_configuration = configuration;
212: }
213:
214: }
215:
216: /**
217: * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag)
218: */
219: public void visitEndTag(Tag tag) {
220:
221: if (m_echo) {
222: m_result.append(getTagHtml(tag));
223: }
224: }
225:
226: /**
227: * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark)
228: */
229: public void visitRemarkNode(Remark remark) {
230:
231: if (m_echo) {
232: m_result.append(remark.toHtml(true));
233: }
234: }
235:
236: /**
237: * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text)
238: */
239: public void visitStringNode(Text text) {
240:
241: if (m_echo) {
242: m_result.append(text.getText());
243: }
244: }
245:
246: /**
247: * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag)
248: */
249: public void visitTag(Tag tag) {
250:
251: if (m_echo) {
252: m_result.append(getTagHtml(tag));
253: }
254: }
255:
256: /**
257: * Collapse HTML whitespace in the given String.<p>
258: *
259: * @param string the string to collapse
260: *
261: * @return the input String with all HTML whitespace collapsed
262: */
263: protected String collapse(String string) {
264:
265: int len = string.length();
266: StringBuffer result = new StringBuffer(len);
267: int state = 0;
268: for (int i = 0; i < len; i++) {
269: char c = string.charAt(i);
270: switch (c) {
271: // see HTML specification section 9.1 White space
272: // http://www.w3.org/TR/html4/struct/text.html#h-9.1
273: case '\u0020':
274: case '\u0009':
275: case '\u000C':
276: case '\u200B':
277: case '\r':
278: case '\n':
279: if (0 != state) {
280: state = 1;
281: }
282: break;
283: default:
284: if (1 == state) {
285: result.append(' ');
286: }
287: state = 2;
288: result.append(c);
289: }
290: }
291: return result.toString();
292: }
293:
294: /**
295: * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p>
296: *
297: * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags
298: */
299: public List getNoAutoCloseTags() {
300:
301: return m_noAutoCloseTags;
302: }
303:
304: /**
305: * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p>
306: *
307: * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting
308: * should not correct missing closing tags to set.
309: */
310: public void setNoAutoCloseTags(List noAutoCloseTagList) {
311:
312: // ensuring upper case
313: m_noAutoCloseTags.clear();
314: if (noAutoCloseTagList != null) {
315: Iterator it = noAutoCloseTagList.iterator();
316: while (it.hasNext()) {
317: m_noAutoCloseTags.add(((String) it.next())
318: .toUpperCase());
319: }
320: }
321: }
322: }
|