001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.portal.util;
018:
import java.io.IOException;
import java.io.StringReader;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Properties;

import org.apache.cocoon.xml.ContentHandlerWrapper;
import org.apache.excalibur.xml.sax.XMLConsumer;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
033:
034: /**
035: * This parser uses the nekohtml parser to parse html and generate sax streams.
036: *
037: * @version $Id: HtmlSaxParser.java 433543 2006-08-22 06:22:54Z crossley $
038: */
039: public class HtmlSaxParser extends AbstractSAXParser {
040:
041: public HtmlSaxParser(Properties properties) {
042: super (getConfig(properties));
043: }
044:
045: protected static HTMLConfiguration getConfig(Properties properties) {
046: HTMLConfiguration config = new HTMLConfiguration();
047: config.setProperty(
048: "http://cyberneko.org/html/properties/names/elems",
049: "lower");
050: if (properties != null) {
051: for (Iterator i = properties.keySet().iterator(); i
052: .hasNext();) {
053: String name = (String) i.next();
054: config.setProperty(name, properties.getProperty(name));
055: }
056: }
057: return config;
058: }
059:
060: /**
061: * Parse html stored in the string.
062: */
063: public static void parseString(String content, ContentHandler ch)
064: throws SAXException {
065: final HtmlSaxParser parser = new HtmlSaxParser(null);
066: parser.setContentHandler(ch);
067: if (ch instanceof LexicalHandler) {
068: parser.setLexicalHandler((LexicalHandler) ch);
069: }
070: final InputSource is = new InputSource(
071: new StringReader(content));
072: try {
073: parser.parse(is);
074: } catch (IOException ioe) {
075: throw new SAXException(
076: "Error during parsing of html markup.", ioe);
077: }
078: }
079:
080: public static XMLConsumer getContentFilter(ContentHandler ch) {
081: return new ContentFilter(ch);
082: }
083:
084: protected static final class ContentFilter extends
085: ContentHandlerWrapper {
086:
087: public ContentFilter(ContentHandler ch) {
088: this .setContentHandler(ch);
089: if (ch instanceof LexicalHandler) {
090: this .setLexicalHandler((LexicalHandler) ch);
091: }
092: }
093:
094: /**
095: * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
096: */
097: public void endElement(String uri, String loc, String raw)
098: throws SAXException {
099: if (!loc.equals("html") && !loc.equals("body")) {
100: super .endElement(uri, loc, raw);
101: }
102: }
103:
104: /**
105: * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
106: */
107: public void startElement(String uri, String loc, String raw,
108: Attributes a) throws SAXException {
109: if (!loc.equals("html") && !loc.equals("body")) {
110: super.startElement(uri, loc, raw, a);
111: }
112: }
113: }
114:
115: }
|