001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.cocoon.transformation;
018:
019: import java.io.BufferedInputStream;
020: import java.io.ByteArrayInputStream;
021: import java.io.IOException;
022: import java.io.PrintWriter;
023: import java.io.StringWriter;
024: import java.util.HashMap;
025: import java.util.Map;
026: import java.util.Properties;
027: import java.util.StringTokenizer;
028:
029: import org.apache.avalon.framework.configuration.Configurable;
030: import org.apache.avalon.framework.configuration.Configuration;
031: import org.apache.avalon.framework.configuration.ConfigurationException;
032: import org.apache.avalon.framework.parameters.Parameters;
033: import org.apache.cocoon.ProcessingException;
034: import org.apache.cocoon.environment.SourceResolver;
035: import org.apache.cocoon.transformation.AbstractSAXTransformer;
036: import org.apache.cocoon.xml.XMLUtils;
037: import org.apache.cocoon.xml.IncludeXMLConsumer;
038: import org.apache.excalibur.source.Source;
039: import org.w3c.tidy.Tidy;
040: import org.xml.sax.Attributes;
041: import org.xml.sax.SAXException;
042:
043: /**
044: * Converts (escaped) HTML snippets into JTidied HTML.
045: * This transformer expects a list of elements, passed as comma separated
046: * values of the "tags" parameter. It records the text enclosed in such
047: * elements and pass it thru JTidy to obtain valid XHTML.
048: *
049: * <p>TODO: Add namespace support.
050: * <p><strong>WARNING:</strong> This transformer should be considered unstable.
051: *
052: * @author <a href="mailto:d.madama@pro-netics.com">Daniele Madama</a>
053: * @author <a href="mailto:gianugo@apache.org">Gianugo Rabellino</a>
054: *
055: * @version CVS $Id: HTMLTransformer.java 433543 2006-08-22 06:22:54Z crossley $
056: */
057: public class HTMLTransformer extends AbstractSAXTransformer implements
058: Configurable {
059:
060: /**
061: * Properties for Tidy format
062: */
063: private Properties properties;
064:
065: /**
066: * Tags that must be normalized
067: */
068: private Map tags;
069:
070: /**
071: * React on endElement calls that contain a tag to be
072: * tidied and run Jtidy on it, otherwise passthru.
073: *
074: * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
075: */
076: public void endElement(String uri, String name, String raw)
077: throws SAXException {
078: if (this .tags.containsKey(name)) {
079: String toBeNormalized = this .endTextRecording();
080: try {
081: this .normalize(toBeNormalized);
082: } catch (ProcessingException e) {
083: e.printStackTrace();
084: }
085: }
086: super .endElement(uri, name, raw);
087: }
088:
089: /**
090: * Start buffering text if inside a tag to be normalized,
091: * passthru otherwise.
092: *
093: * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
094: */
095: public void startElement(String uri, String name, String raw,
096: Attributes attr) throws SAXException {
097: super .startElement(uri, name, raw, attr);
098: if (this .tags.containsKey(name)) {
099: this .startTextRecording();
100: }
101: }
102:
103: /**
104: * Configure this transformer, possibly passing to it
105: * a jtidy configuration file location.
106: */
107: public void configure(Configuration config)
108: throws ConfigurationException {
109: super .configure(config);
110:
111: String configUrl = config.getChild("jtidy-config").getValue(
112: null);
113: if (configUrl != null) {
114: org.apache.excalibur.source.SourceResolver resolver = null;
115: Source configSource = null;
116: try {
117: resolver = (org.apache.excalibur.source.SourceResolver) this .manager
118: .lookup(org.apache.excalibur.source.SourceResolver.ROLE);
119: configSource = resolver.resolveURI(configUrl);
120: if (getLogger().isDebugEnabled()) {
121: getLogger().debug(
122: "Loading configuration from "
123: + configSource.getURI());
124: }
125: this .properties = new Properties();
126: this .properties.load(configSource.getInputStream());
127:
128: } catch (Exception e) {
129: getLogger().warn(
130: "Cannot load configuration from " + configUrl);
131: throw new ConfigurationException(
132: "Cannot load configuration from " + configUrl,
133: e);
134: } finally {
135: if (null != resolver) {
136: this .manager.release(resolver);
137: resolver.release(configSource);
138: }
139: }
140: }
141: }
142:
143: /**
144: * The beef: run JTidy on the buffered text and stream
145: * the result
146: *
147: * @param text the string to be tidied
148: */
149: private void normalize(String text) throws ProcessingException {
150: try {
151: // Setup an instance of Tidy.
152: Tidy tidy = new Tidy();
153: tidy.setXmlOut(true);
154:
155: if (this .properties == null) {
156: tidy.setXHTML(true);
157: } else {
158: tidy.setConfigurationFromProps(this .properties);
159: }
160:
161: //Set Jtidy warnings on-off
162: tidy.setShowWarnings(getLogger().isWarnEnabled());
163: //Set Jtidy final result summary on-off
164: tidy.setQuiet(!getLogger().isInfoEnabled());
165: //Set Jtidy infos to a String (will be logged) instead of System.out
166: StringWriter stringWriter = new StringWriter();
167: PrintWriter errorWriter = new PrintWriter(stringWriter);
168: tidy.setErrout(errorWriter);
169:
170: // Extract the document using JTidy and stream it.
171: ByteArrayInputStream bais = new ByteArrayInputStream(text
172: .getBytes());
173: org.w3c.dom.Document doc = tidy.parseDOM(
174: new BufferedInputStream(bais), null);
175:
176: // FIXME: Jtidy doesn't warn or strip duplicate attributes in same
177: // tag; stripping.
178: XMLUtils.stripDuplicateAttributes(doc, null);
179:
180: errorWriter.flush();
181: errorWriter.close();
182: if (getLogger().isWarnEnabled()) {
183: getLogger().warn(stringWriter.toString());
184: }
185:
186: IncludeXMLConsumer.includeNode(doc, this .contentHandler,
187: this .lexicalHandler);
188: } catch (Exception e) {
189: throw new ProcessingException(
190: "Exception in HTMLTransformer.normalize()", e);
191: }
192: }
193:
194: /**
195: * Setup this component, passing the tag names to be tidied.
196: */
197:
198: public void setup(SourceResolver resolver, Map objectModel,
199: String src, Parameters par) throws ProcessingException,
200: SAXException, IOException {
201: super .setup(resolver, objectModel, src, par);
202: String tagsParam = par.getParameter("tags", "");
203: if (getLogger().isDebugEnabled()) {
204: getLogger().debug("tags: " + tagsParam);
205: }
206: this .tags = new HashMap();
207: StringTokenizer tokenizer = new StringTokenizer(tagsParam, ",");
208: while (tokenizer.hasMoreElements()) {
209: String tok = tokenizer.nextToken().trim();
210: this.tags.put(tok, tok);
211: }
212: }
213: }
|