001: /*
002: ******************************************************************
003: Copyright (c) 2001-2007, Jeff Martin, Tim Bacon
004: All rights reserved.
005:
006: Redistribution and use in source and binary forms, with or without
007: modification, are permitted provided that the following conditions
008: are met:
009:
010: * Redistributions of source code must retain the above copyright
011: notice, this list of conditions and the following disclaimer.
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the following
014: disclaimer in the documentation and/or other materials provided
015: with the distribution.
016: * Neither the name of the xmlunit.sourceforge.net nor the names
017: of its contributors may be used to endorse or promote products
018: derived from this software without specific prior written
019: permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
022: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
023: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
024: FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
025: COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
026: INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
027: BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
028: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
029: CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
030: LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
031: ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
032: POSSIBILITY OF SUCH DAMAGE.
033:
034: ******************************************************************
035: */
036:
037: package org.custommonkey.xmlunit;
038:
039: import java.io.IOException;
040: import java.io.Reader;
041: import java.io.StringReader;
042: import java.util.Enumeration;
043:
044: import javax.swing.text.*;
045: import javax.swing.text.html.*;
046: import javax.swing.text.html.parser.*;
047:
048: import org.w3c.dom.Document;
049:
050: import org.xml.sax.ContentHandler;
051: import org.xml.sax.Attributes;
052: import org.xml.sax.SAXException;
053: import org.xml.sax.ext.LexicalHandler;
054: import org.xml.sax.helpers.AttributesImpl;
055:
056: /**
057: * Build a DOM document from HTML content converting from 'plain' HTML into
058: * 'XHTML' along the way with the help of a TolerantSaxDocumentBuilder and
059: * the Swing html parser classes.
060: * This allows XML assertions to be made against badly formed HTML.
061: * <br />Examples and more at <a href="http://xmlunit.sourceforge.net"/>xmlunit.sourceforge.net</a>
062: * @see TolerantSaxDocumentBuilder
063: */
064: public class HTMLDocumentBuilder {
065: protected final TolerantSaxDocumentBuilder tolerantSaxDocumentBuilder;
066: protected final SwingEvent2SaxAdapter swingEvent2SaxAdapter;
067: private final StringBuffer traceBuffer;
068:
069: /**
070: * Constructor
071: * @param tolerantSaxDocumentBuilder the instance that will receive SAX
072: * calls generated as the HTML is parsed and build up a DOM Document
073: */
074: public HTMLDocumentBuilder(
075: TolerantSaxDocumentBuilder tolerantSaxDocumentBuilder) {
076: this .tolerantSaxDocumentBuilder = tolerantSaxDocumentBuilder;
077: this .swingEvent2SaxAdapter = new SwingEvent2SaxAdapter();
078: this .traceBuffer = new StringBuffer();
079: }
080:
081: /**
082: * @return a DOM document parsed from the Reader via an SwingEvent2SaxAdapter
083: * and TolerantSaxBuilder.
084: * Not thread-safe!
085: * @see TolerantSaxDocumentBuilder
086: */
087: public Document parse(Reader reader) throws SAXException,
088: IOException {
089: traceBuffer.delete(0, traceBuffer.length());
090: swingEvent2SaxAdapter.parse(reader, tolerantSaxDocumentBuilder);
091: traceBuffer.append(tolerantSaxDocumentBuilder.getTrace());
092: return tolerantSaxDocumentBuilder.getDocument();
093: }
094:
095: /**
096: * @return a DOM document parsed from the String via an SwingEvent2SaxAdapter
097: * and TolerantSaxBuilder.
098: * Not thread-safe!
099: * @see TolerantSaxDocumentBuilder
100: */
101: public Document parse(String htmlString) throws SAXException,
102: IOException {
103: return parse(new StringReader(htmlString));
104: }
105:
106: /**
107: * @return the trace of events and / or warnings encountered during parsing
108: */
109: public String getTrace() {
110: return traceBuffer.toString();
111: }
112:
113: /**
114: * Append to the log built up during parsing
115: * @param msg what to append
116: */
117: private void trace(String msg) {
118: traceBuffer.append(msg).append('\n');
119: }
120:
121: /**
122: * Adapts Swing HTML callback messages to Sax equivalents, passing them
123: * to a Sax-aware ContentHandler.
124: */
125: public class SwingEvent2SaxAdapter extends
126: HTMLEditorKit.ParserCallback {
127: private static final boolean IGNORE_HTML_CHAR_SET = true;
128: private final AttributesImpl attributes;
129: private final ParserDelegator delegator;
130: private boolean lastTagWasSimpleTag;
131: private ContentHandler saxContentHandler;
132: private SAXException firstUnhandledException;
133:
134: /**
135: * Default constructor
136: */
137: public SwingEvent2SaxAdapter() {
138: this .attributes = new AttributesImpl();
139: this .delegator = new ParserDelegator();
140: }
141:
142: /**
143: * Perform Swing-HTML-parse-event-to-Sax-event conversion
144: */
145: public void parse(Reader reader,
146: ContentHandler saxContentHandler) throws SAXException,
147: IOException {
148: this .saxContentHandler = saxContentHandler;
149: preParse();
150: delegator.parse(reader, this , IGNORE_HTML_CHAR_SET);
151: postParse();
152: }
153:
154: /**
155: * Equivalent to Sax <code>startDocument</code>
156: * @throws SAXException
157: */
158: private void preParse() throws SAXException {
159: firstUnhandledException = null;
160: saxContentHandler.startDocument();
161: }
162:
163: /**
164: * Equivalent to Sax <code>endDocument</code>
165: * @throws SAXException if any SAXExceptions have occurred during
166: * parsing
167: */
168: private void postParse() throws SAXException {
169: try {
170: saxContentHandler.endDocument();
171: } catch (SAXException e) {
172: handleSAXException(e);
173: }
174: if (firstUnhandledException != null) {
175: throw firstUnhandledException;
176: }
177: }
178:
179: /**
180: * Swing-HTML-parser template method, no ContentHandler equivalent
181: */
182: public void flush()
183: throws javax.swing.text.BadLocationException {
184: }
185:
186: /**
187: * Equivalent to Sax <code>characters</code>
188: */
189: public void handleText(char[] data, int pos) {
190: try {
191: int startPos;
192: if (lastTagWasSimpleTag) {
193: startPos = getStartIgnoringClosingSimpleTag(data);
194: } else {
195: startPos = 0;
196: }
197: if (startPos < data.length) {
198: saxContentHandler.characters(data, startPos,
199: data.length - startPos);
200: }
201: } catch (SAXException e) {
202: handleSAXException(e);
203: }
204: }
205:
206: /**
207: * Adjusts the start offset into the character array for the fact that
208: * the Swing HTML parser doesn't handle simple tags with explicit
209: * closing angle brackets e.g. <hr/>
210: * @param data
211: * @return offset of actual character data into the array
212: */
213: private int getStartIgnoringClosingSimpleTag(char[] data) {
214: if (data[0] == '>') {
215: return 1;
216: }
217: return 0;
218: }
219:
220: /**
221: * Equivalent to Sax LexicalHandler <code>comment</code> method.
222: * If the supplied ContentHandler is also an LexicalHandler then the
223: * cast will be made and the sax event passed on.
224: */
225: public void handleComment(char[] data, int pos) {
226: if (saxContentHandler instanceof LexicalHandler) {
227: try {
228: ((LexicalHandler) saxContentHandler).comment(data,
229: 0, data.length);
230: } catch (SAXException e) {
231: handleSAXException(e);
232: }
233: } else {
234: trace("Unhandled comment " + new String(data));
235: }
236: }
237:
238: /**
239: * Equivalent to Sax <code>startElement</code>
240: */
241: public void handleStartTag(javax.swing.text.html.HTML.Tag tag,
242: javax.swing.text.MutableAttributeSet attributeSet,
243: int pos) {
244: try {
245: saxContentHandler.startElement("", "", tag.toString(),
246: convertToSaxAttributes(attributeSet));
247: } catch (SAXException e) {
248: handleSAXException(e);
249: }
250: lastTagWasSimpleTag = false;
251: }
252:
253: /**
254: * Equivalent to Sax <code>endElement</code>
255: */
256: public void handleEndTag(javax.swing.text.html.HTML.Tag tag,
257: int pos) {
258: try {
259: saxContentHandler.endElement("", "", tag.toString());
260: } catch (SAXException e) {
261: handleSAXException(e);
262: }
263: }
264:
265: /**
266: * Equivalent to Sax <code>startElement</code> plus
267: * <code>endElement</code>
268: */
269: public void handleSimpleTag(javax.swing.text.html.HTML.Tag tag,
270: javax.swing.text.MutableAttributeSet attributeSet,
271: int pos) {
272: handleStartTag(tag, attributeSet, pos);
273: handleEndTag(tag, pos);
274: lastTagWasSimpleTag = true;
275: }
276:
277: /**
278: * Swing-HTML-parser template method, no ContentHandler equivalent.
279: * These errors are generally recoverable, so they are logged.
280: */
281: public void handleError(String errorMsg, int pos) {
282: trace("HTML ERROR: " + errorMsg);
283: }
284:
285: /**
286: * Simple conversion method.
287: * @param attributeSet
288: * @return Sax CDATA Attributes from the Swing MutableAttributeSet
289: */
290: private Attributes convertToSaxAttributes(
291: MutableAttributeSet attributeSet) {
292: Object attrName, attrValue;
293:
294: attributes.clear();
295: for (Enumeration en = attributeSet.getAttributeNames(); en
296: .hasMoreElements();) {
297: attrName = en.nextElement();
298: attrValue = attributeSet.getAttribute(attrName);
299: attributes.addAttribute("", "", attrName.toString(),
300: "CDATA", attrValue.toString());
301: }
302:
303: return attributes;
304: }
305:
306: /**
307: * Log an error from the ContentHandler for raising post-parse
308: */
309: private void handleSAXException(SAXException e) {
310: trace("SAX Error: " + e.getMessage());
311: if (firstUnhandledException == null) {
312: firstUnhandledException = e;
313: }
314: }
315: }
316:
317: }
|