01: /*
02: * Copyright 2004 Outerthought bvba and Schaubroeck nv
03: *
04: * Licensed under the Apache License, Version 2.0 (the "License");
05: * you may not use this file except in compliance with the License.
06: * You may obtain a copy of the License at
07: *
08: * http://www.apache.org/licenses/LICENSE-2.0
09: *
10: * Unless required by applicable law or agreed to in writing, software
11: * distributed under the License is distributed on an "AS IS" BASIS,
12: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13: * See the License for the specific language governing permissions and
14: * limitations under the License.
15: */
16: package org.outerj.daisy.htmlcleaner;
17:
18: import java.io.OutputStream;
19: import java.io.ByteArrayOutputStream;
20: import org.outerj.daisy.xmlutil.SaxBuffer;
21:
22: /**
23: * Performs cleanup of HTML documents to well-formed HTML-as-XML documents.
24: *
25: * <p>More information:
26: * <ul>
27: * <li>To instantiate: see {@link HtmlCleanerFactory} and {@link HtmlCleanerTemplate}
28: * <li>About cleanup procedure: see {@link NekoHtmlParser}, {@link HtmlRepairer}
29: * and {@link StylingHtmlSerializer}.
30: * </ul>
31: */
32: public class HtmlCleaner {
33: private HtmlCleanerTemplate template;
34:
35: HtmlCleaner(HtmlCleanerTemplate template) {
36: this .template = template;
37: }
38:
39: /**
40: * Parses and cleans up the HTML, writing the result to the given outputstream,
41: * encoded as UTF-8.
42: */
43: public void clean(String somethingWhichLooksLikeHtml,
44: OutputStream outputStream) throws Exception {
45: NekoHtmlParser parser = new NekoHtmlParser();
46: SaxBuffer buffer = parser.parse(GeckoCorruptTagCleaner
47: .clean(somethingWhichLooksLikeHtml));
48:
49: StylingHtmlSerializer serializer = new StylingHtmlSerializer(
50: template);
51: serializer.setOutputStream(outputStream);
52: HtmlRepairer repairer = new HtmlRepairer(template);
53:
54: repairer.clean(buffer, new MergeCharacterEventsHandler(
55: serializer));
56: }
57:
58: public byte[] cleanToByteArray(String somethingWhichLooksLikeHtml)
59: throws Exception {
60: ByteArrayOutputStream os = new ByteArrayOutputStream(10000);
61: clean(somethingWhichLooksLikeHtml, os);
62: return os.toByteArray();
63: }
64:
65: public String cleanToString(String somethingWhichLooksLikeHtml)
66: throws Exception {
67: ByteArrayOutputStream os = new ByteArrayOutputStream(10000);
68: clean(somethingWhichLooksLikeHtml, os);
69: return os.toString("UTF-8");
70: }
71: }
|