001: /*
002: * Copyright 2004 Outerthought bvba and Schaubroeck nv
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package org.outerj.daisy.htmlcleaner;
017:
018: import org.xml.sax.InputSource;
019: import org.w3c.dom.*;
020:
021: import javax.xml.parsers.DocumentBuilderFactory;
022: import javax.xml.parsers.DocumentBuilder;
023: import java.util.ArrayList;
024: import java.util.List;
025:
026: /**
027: * Builds a {@link HtmlCleanerTemplate} based on a XML configuration.
028: * The configuration describes such things as which elements and
029: * attributes should be kept, or how wide the output should be.
030: * See the example config files.
031: *
032: * <p>Instances of this class are not thread safe and not reusable,
033: * in other words construct a new HtmlCleanerFactory each time you
034: * need it.
035: */
036: public class HtmlCleanerFactory {
037: private boolean handledCleanup = false;
038: private boolean handledSerialization = false;
039: HtmlCleanerTemplate template = new HtmlCleanerTemplate();
040:
041: public HtmlCleanerTemplate buildTemplate(InputSource is)
042: throws Exception {
043: DocumentBuilderFactory dbf = DocumentBuilderFactory
044: .newInstance();
045: dbf.setNamespaceAware(true);
046: DocumentBuilder db = dbf.newDocumentBuilder();
047: Document document = db.parse(is);
048: document.normalize();
049:
050: Element docEl = document.getDocumentElement();
051: if (!(docEl.getLocalName().equals("htmlcleaner") && docEl
052: .getNamespaceURI() == null)) {
053: throw new Exception(
054: "Htmlcleaner config file should have root elemnet 'htmlcleaner'.");
055: }
056:
057: NodeList nodeList = docEl.getChildNodes();
058: for (int i = 0; i < nodeList.getLength(); i++) {
059: Node node = nodeList.item(i);
060:
061: if (node instanceof Element) {
062: if (node.getNamespaceURI() == null
063: && node.getLocalName().equals("cleanup")) {
064: handleCleanupNode((Element) node);
065: } else if (node.getNamespaceURI() == null
066: && node.getLocalName().equals("serialization")) {
067: handleSerializationNode((Element) node);
068: } else {
069: throw new Exception(
070: "Error in htmlcleaner config: unexpected element: "
071: + node.getNodeName());
072: }
073: }
074: }
075: template.initialize();
076: return template;
077: }
078:
079: private void handleCleanupNode(Element cleanupEl) throws Exception {
080: if (handledCleanup)
081: throw new Exception(
082: "Error in htmlcleaner config: cleanup element is only allowed once");
083: handledCleanup = true;
084:
085: NodeList cleanupNodes = cleanupEl.getChildNodes();
086: for (int k = 0; k < cleanupNodes.getLength(); k++) {
087: Node node = cleanupNodes.item(k);
088: if (node instanceof Element) {
089: if (node.getNamespaceURI() == null
090: && node.getLocalName().equals(
091: "allowed-span-classes")) {
092: String[] classes = getClassChildren((Element) node);
093: for (String clazz : classes)
094: template.addAllowedSpanClass(clazz);
095: } else if (node.getNamespaceURI() == null
096: && node.getLocalName().equals(
097: "allowed-div-classes")) {
098: String[] classes = getClassChildren((Element) node);
099: for (String clazz : classes)
100: template.addAllowedDivClass(clazz);
101: } else if (node.getNamespaceURI() == null
102: && node.getLocalName().equals(
103: "allowed-para-classes")) {
104: String[] classes = getClassChildren((Element) node);
105: for (String clazz : classes)
106: template.addAllowedParaClass(clazz);
107: } else if (node.getNamespaceURI() == null
108: && node.getLocalName().equals(
109: "allowed-pre-classes")) {
110: String[] classes = getClassChildren((Element) node);
111: for (String clazz : classes)
112: template.addAllowedPreClass(clazz);
113: } else if (node.getNamespaceURI() == null
114: && node.getLocalName().equals(
115: "drop-div-classes")) {
116: String[] classes = getClassChildren((Element) node);
117: for (String clazz : classes)
118: template.addDropDivClass(clazz);
119: } else if (node.getNamespaceURI() == null
120: && node.getLocalName().equals(
121: "allowed-elements")) {
122: handleAllowedElementsNode((Element) node);
123: } else if (node.getNamespaceURI() == null
124: && node.getLocalName().equals(
125: "img-alternate-src-attr")) {
126: String name = ((Element) node).getAttribute("name");
127: if (name.equals(""))
128: throw new Exception(
129: "Error in htmlcleaner config: missing name attribute on img-alternate-src-attr");
130: template.setImgAlternateSrcAttr(name);
131: } else if (node.getNamespaceURI() == null
132: && node.getLocalName().equals(
133: "link-alternate-href-attr")) {
134: String name = ((Element) node).getAttribute("name");
135: if (name.equals(""))
136: throw new Exception(
137: "Error in htmlcleaner config: missing name attribute on link-alternate-href-attr");
138: template.setLinkAlternateHrefAttr(name);
139: } else {
140: throw new Exception(
141: "Error in htmlcleaner config: unexpected element "
142: + node.getNodeName() + " inside "
143: + cleanupEl.getNodeName());
144: }
145: }
146: }
147:
148: }
149:
150: private String[] getClassChildren(Element element) throws Exception {
151: List<String> classes = new ArrayList<String>();
152: NodeList nodeList = element.getChildNodes();
153: for (int i = 0; i < nodeList.getLength(); i++) {
154: Node node = nodeList.item(i);
155: if (node instanceof Element) {
156: if (node.getNamespaceURI() == null
157: && node.getLocalName().equals("class")) {
158: Node text = node.getFirstChild();
159: if (text instanceof Text) {
160: classes.add(((Text) text).getData());
161: } else {
162: throw new Exception(
163: "Error in htmlcleaner: element class does not have a text node child");
164: }
165: } else {
166: throw new Exception(
167: "Error in htmlcleaner config: unexpected element: "
168: + node.getNodeName()
169: + " as child of "
170: + element.getNodeName());
171: }
172: }
173: }
174: return classes.toArray(new String[classes.size()]);
175: }
176:
177: private void handleAllowedElementsNode(Element element)
178: throws Exception {
179: NodeList children = element.getChildNodes();
180: for (int i = 0; i < children.getLength(); i++) {
181: Node node = children.item(i);
182:
183: if (node instanceof Element) {
184: if (node.getNamespaceURI() == null
185: && node.getLocalName().equals("element")) {
186: String name = ((Element) node).getAttribute("name");
187: if (name.equals(""))
188: throw new Exception(
189: "Error in htmlcleaner config: missing name attribute on 'element' element");
190: String[] attributes = getAttributeChildren((Element) node);
191: template.addAllowedElement(name, attributes);
192: } else {
193: throw new Exception(
194: "Error in htmlcleaner config: unexpected element: '"
195: + node.getNodeName()
196: + "' as child of "
197: + element.getNodeName());
198: }
199: }
200: }
201: }
202:
203: private String[] getAttributeChildren(Element element)
204: throws Exception {
205: List<String> names = new ArrayList<String>();
206: NodeList children = element.getChildNodes();
207: for (int i = 0; i < children.getLength(); i++) {
208: Node node = children.item(i);
209: if (node instanceof Element) {
210: if (node.getNamespaceURI() == null
211: && node.getLocalName().equals("attribute")) {
212: String name = ((Element) node).getAttribute("name");
213: if (name.equals(""))
214: throw new Exception(
215: "Error in htmlcleaner config: missing name attribute on attribute element");
216: names.add(name);
217: } else {
218: throw new Exception(
219: "Error in htmlcleaner config: unexpected element: '"
220: + node.getNodeName()
221: + "' as child of "
222: + element.getNodeName());
223: }
224: }
225: }
226: return names.toArray(new String[names.size()]);
227: }
228:
229: private void handleSerializationNode(Element element)
230: throws Exception {
231: if (handledSerialization)
232: throw new Exception(
233: "Error in htmlcleaner config: serialization element is only allowed once");
234: handledSerialization = true;
235:
236: NodeList children = element.getChildNodes();
237: for (int i = 0; i < children.getLength(); i++) {
238: Node node = children.item(i);
239: if (node instanceof Element) {
240: if (node.getNamespaceURI() == null
241: && node.getLocalName().equals("linewidth")) {
242: String value = ((Element) node)
243: .getAttribute("value");
244: if (value.equals(""))
245: throw new Exception(
246: "Error in htmlcleaner config: missing value attribute on linewidth element.");
247: int intValue = Integer.parseInt(value);
248: template.setMaxLineWidth(intValue);
249: } else if (node.getNamespaceURI() == null
250: && node.getLocalName().equals("elements")) {
251: handleElementsNode((Element) node);
252: } else {
253: throw new Exception(
254: "Error in htmlcleaner config: unexpected element '"
255: + node.getNodeName()
256: + "' as child of "
257: + element.getNodeName());
258: }
259: }
260: }
261: }
262:
263: private void handleElementsNode(Element element) throws Exception {
264: NodeList children = element.getChildNodes();
265: for (int i = 0; i < children.getLength(); i++) {
266: Node node = children.item(i);
267: if (node instanceof Element) {
268: if (node.getNamespaceURI() == null
269: && node.getLocalName().equals("element")) {
270: Element childEl = (Element) node;
271: String name = childEl.getAttribute("name");
272: if (name.equals(""))
273: throw new Exception(
274: "Error in htmlcleaner config: missing name attribute on 'element' element.");
275: String beforeOpenAttr = childEl
276: .getAttribute("beforeOpen");
277: String afterOpenAttr = childEl
278: .getAttribute("afterOpen");
279: String beforeCloseAttr = childEl
280: .getAttribute("beforeClose");
281: String afterCloseAttr = childEl
282: .getAttribute("afterClose");
283: int beforeOpen = 0, afterOpen = 0, beforeClose = 0, afterClose = 0;
284: if (!beforeOpenAttr.equals(""))
285: beforeOpen = Integer.parseInt(beforeOpenAttr);
286: if (!afterOpenAttr.equals(""))
287: afterOpen = Integer.parseInt(afterOpenAttr);
288: if (!beforeCloseAttr.equals(""))
289: beforeClose = Integer.parseInt(beforeCloseAttr);
290: if (!afterCloseAttr.equals(""))
291: afterClose = Integer.parseInt(afterCloseAttr);
292: boolean inline = "true".equals(childEl
293: .getAttribute("inline"));
294: template.addOutputElement(name, beforeOpen,
295: afterOpen, beforeClose, afterClose, inline);
296: }
297: }
298: }
299: }
300: }
|