01: package com.meterware.httpunit.parsing;
02:
03: /********************************************************************************************************************
04: * $Id: JTidyHTMLParser.java,v 1.2 2002/12/25 15:23:11 russgold Exp $
05: *
06: * Copyright (c) 2002, Russell Gold
07: *
08: * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
09: * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
10: * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
11: * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12: *
13: * The above copyright notice and this permission notice shall be included in all copies or substantial portions
14: * of the Software.
15: *
16: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
17: * THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19: * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20: * DEALINGS IN THE SOFTWARE.
21: *
22: *******************************************************************************************************************/
23: import org.w3c.tidy.Tidy;
24: import org.xml.sax.SAXException;
25:
26: import java.net.URL;
27: import java.io.IOException;
28: import java.io.ByteArrayInputStream;
29: import java.io.UnsupportedEncodingException;
30:
31: /**
32: *
33: * @author <a href="mailto:russgold@httpunit.org">Russell Gold</a>
34: **/
35: class JTidyHTMLParser implements HTMLParser {
36:
37: public void parse(URL pageURL, String pageText,
38: DocumentAdapter adapter) throws IOException, SAXException {
39: try {
40: adapter.setRootNode(getParser(pageURL).parseDOM(
41: new ByteArrayInputStream(pageText
42: .getBytes(UTF_ENCODING)), null));
43: } catch (UnsupportedEncodingException e) {
44: throw new RuntimeException("UTF-8 encoding failed");
45: }
46: }
47:
48: public String getCleanedText(String string) {
49: return (string == null) ? "" : string.replace(NBSP, ' ');
50: }
51:
52: public boolean supportsPreserveTagCase() {
53: return false;
54: }
55:
56: public boolean supportsReturnHTMLDocument() {
57: return false;
58: }
59:
60: public boolean supportsParserWarnings() {
61: return true;
62: }
63:
64: final private static char NBSP = (char) 160; // non-breaking space, defined by JTidy
65:
66: final private static String UTF_ENCODING = "UTF-8";
67:
68: private static Tidy getParser(URL url) {
69: Tidy tidy = new Tidy();
70: tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
71: tidy.setQuiet(true);
72: tidy.setShowWarnings(HTMLParserFactory
73: .isParserWarningsEnabled());
74: if (!HTMLParserFactory.getHTMLParserListeners().isEmpty()) {
75: tidy.setErrout(new JTidyPrintWriter(url));
76: }
77: return tidy;
78: }
79:
80: }
|