001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.transformation;
018:
019: import java.io.ByteArrayInputStream;
020: import java.io.IOException;
021: import java.util.HashMap;
022: import java.util.Iterator;
023: import java.util.Map;
024: import java.util.Properties;
025: import java.util.StringTokenizer;
026:
027: import org.apache.avalon.framework.configuration.Configurable;
028: import org.apache.avalon.framework.configuration.Configuration;
029: import org.apache.avalon.framework.configuration.ConfigurationException;
030: import org.apache.avalon.framework.parameters.Parameters;
031: import org.apache.cocoon.ProcessingException;
032: import org.apache.cocoon.environment.SourceResolver;
033: import org.apache.cocoon.xml.dom.DOMBuilder;
034: import org.apache.cocoon.xml.IncludeXMLConsumer;
035: import org.apache.excalibur.source.Source;
036: import org.apache.xerces.parsers.AbstractSAXParser;
037: import org.cyberneko.html.HTMLConfiguration;
038: import org.w3c.dom.Document;
039: import org.xml.sax.Attributes;
040: import org.xml.sax.InputSource;
041: import org.xml.sax.SAXException;
042:
043: /**
044: * Converts (escaped) HTML snippets into tidied HTML using the NekoHTML library.
045: * This transformer expects a list of elements, passed as comma separated
046: * values of the "tags" parameter. It records the text enclosed in such
047: * elements and passes it through Neko to obtain valid XHTML.
048: *
049: * @version $Id: NekoHTMLTransformer.java 433543 2006-08-22 06:22:54Z crossley $
050: */
051: public class NekoHTMLTransformer extends AbstractSAXTransformer
052: implements Configurable {
053:
054: /**
055: * Properties for Neko format
056: */
057: private Properties properties;
058:
059: /**
060: * Tags that must be normalized
061: */
062: private Map tags;
063:
064: /**
065: * React on endElement calls that contain a tag to be
066: * tidied and run Neko on it, otherwise passthru.
067: *
068: * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
069: */
070: public void endElement(String uri, String name, String raw)
071: throws SAXException {
072: if (this .tags.containsKey(name)) {
073: String toBeNormalized = this .endTextRecording();
074: try {
075: this .normalize(toBeNormalized);
076: } catch (ProcessingException e) {
077: e.printStackTrace();
078: }
079: }
080: super .endElement(uri, name, raw);
081: }
082:
083: /**
084: * Start buffering text if inside a tag to be normalized,
085: * passthru otherwise.
086: *
087: * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
088: */
089: public void startElement(String uri, String name, String raw,
090: Attributes attr) throws SAXException {
091: super .startElement(uri, name, raw, attr);
092: if (this .tags.containsKey(name)) {
093: this .startTextRecording();
094: }
095: }
096:
097: /**
098: * Configure this transformer, possibly passing to it
099: * a jtidy configuration file location.
100: */
101: public void configure(Configuration config)
102: throws ConfigurationException {
103: super .configure(config);
104:
105: String configUrl = config.getChild("neko-config")
106: .getValue(null);
107: if (configUrl != null) {
108: org.apache.excalibur.source.SourceResolver resolver = null;
109: Source configSource = null;
110: try {
111: resolver = (org.apache.excalibur.source.SourceResolver) this .manager
112: .lookup(org.apache.excalibur.source.SourceResolver.ROLE);
113: configSource = resolver.resolveURI(configUrl);
114: if (getLogger().isDebugEnabled()) {
115: getLogger().debug(
116: "Loading configuration from "
117: + configSource.getURI());
118: }
119: this .properties = new Properties();
120: this .properties.load(configSource.getInputStream());
121:
122: } catch (Exception e) {
123: getLogger().warn(
124: "Cannot load configuration from " + configUrl);
125: throw new ConfigurationException(
126: "Cannot load configuration from " + configUrl,
127: e);
128: } finally {
129: if (null != resolver) {
130: this .manager.release(resolver);
131: resolver.release(configSource);
132: }
133: }
134: }
135: }
136:
137: /**
138: * The beef: run Neko on the buffered text and stream
139: * the result
140: *
141: * @param text the string to be tidied
142: */
143: private void normalize(String text) throws ProcessingException {
144: try {
145: HtmlSaxParser parser = new HtmlSaxParser(this .properties);
146:
147: ByteArrayInputStream bais = new ByteArrayInputStream(text
148: .getBytes());
149:
150: DOMBuilder builder = new DOMBuilder();
151: parser.setContentHandler(builder);
152: parser.parse(new InputSource(bais));
153: Document doc = builder.getDocument();
154:
155: IncludeXMLConsumer.includeNode(doc, this .contentHandler,
156: this .lexicalHandler);
157: } catch (Exception e) {
158: throw new ProcessingException(
159: "Exception in NekoHTMLTransformer.normalize()", e);
160: }
161: }
162:
163: /**
164: * Setup this component, passing the tag names to be tidied.
165: */
166:
167: public void setup(SourceResolver resolver, Map objectModel,
168: String src, Parameters par) throws ProcessingException,
169: SAXException, IOException {
170: super .setup(resolver, objectModel, src, par);
171: String tagsParam = par.getParameter("tags", "");
172: if (getLogger().isDebugEnabled()) {
173: getLogger().debug("tags: " + tagsParam);
174: }
175: this .tags = new HashMap();
176: StringTokenizer tokenizer = new StringTokenizer(tagsParam, ",");
177: while (tokenizer.hasMoreElements()) {
178: String tok = tokenizer.nextToken().trim();
179: this .tags.put(tok, tok);
180: }
181: }
182:
183: public static class HtmlSaxParser extends AbstractSAXParser {
184:
185: public HtmlSaxParser(Properties properties) {
186: super (getConfig(properties));
187: }
188:
189: private static HTMLConfiguration getConfig(Properties properties) {
190: HTMLConfiguration config = new HTMLConfiguration();
191: config.setProperty(
192: "http://cyberneko.org/html/properties/names/elems",
193: "lower");
194: if (properties != null) {
195: for (Iterator i = properties.keySet().iterator(); i
196: .hasNext();) {
197: String name = (String) i.next();
198: if (name.indexOf("/features/") > -1) {
199: config.setFeature(name, Boolean
200: .getBoolean(properties
201: .getProperty(name)));
202: } else if (name.indexOf("/properties/") > -1) {
203: config.setProperty(name, properties
204: .getProperty(name));
205: }
206: }
207: }
208: return config;
209: }
210: }
211: }
|