001: /*
002: * Copyright 2002-2008 Andy Clark
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.cyberneko.html.filters;
018:
019: import java.util.Hashtable;
020:
021: import org.apache.xerces.xni.Augmentations;
022: import org.apache.xerces.xni.NamespaceContext;
023: import org.apache.xerces.xni.QName;
024: import org.apache.xerces.xni.XMLAttributes;
025: import org.apache.xerces.xni.XMLLocator;
026: import org.apache.xerces.xni.XMLResourceIdentifier;
027: import org.apache.xerces.xni.XMLString;
028: import org.apache.xerces.xni.XNIException;
029:
030: /**
031: * This class is a document filter capable of removing specified
032: * elements from the processing stream. There are two options for
033: * processing document elements:
034: * <ul>
035: * <li>specifying those elements which should be accepted and,
036: * optionally, which attributes of that element should be
037: * kept; and
038: * <li>specifying those elements whose tags and content should be
039: * completely removed from the event stream.
040: * </ul>
041: * <p>
042: * The first option allows the application to specify which elements
043: * appearing in the event stream should be accepted and, therefore,
044: * passed on to the next stage in the pipeline. All elements
045: * <em>not</em> in the list of acceptable elements have their start
046: * and end tags stripped from the event stream <em>unless</em> those
047: * elements appear in the list of elements to be removed.
048: * <p>
049: * The second option allows the application to specify which elements
050: * should be completely removed from the event stream. When an element
051: * appears that is to be removed, the element's start and end tag as
052: * well as all of that element's content is removed from the event
053: * stream.
054: * <p>
055: * A common use of this filter would be to only allow rich-text
056: * and linking elements as well as the character content to pass
057: * through the filter — all other elements would be stripped.
058: * The following code shows how to configure this filter to perform
059: * this task:
060: * <pre>
061: * ElementRemover remover = new ElementRemover();
062: * remover.acceptElement("b", null);
063: * remover.acceptElement("i", null);
064: * remover.acceptElement("u", null);
065: * remover.acceptElement("a", new String[] { "href" });
066: * </pre>
067: * <p>
068: * However, this would still allow the text content of other
069: * elements to pass through, which may not be desirable. In order
070: * to further "clean" the input, the <code>removeElement</code>
071: * option can be used. The following piece of code adds the ability
072: * to completely remove any <SCRIPT> tags and content
073: * from the stream.
074: * <pre>
075: * remover.removeElement("script");
076: * </pre>
077: * <p>
078: * <strong>Note:</strong>
079: * All text and accepted element children of a stripped element is
080: * retained. To completely remove an element's content, use the
081: * <code>removeElement</code> method.
082: * <p>
083: * <strong>Note:</strong>
084: * Care should be taken when using this filter because the output
085: * may not be a well-balanced tree. Specifically, if the application
086: * removes the <HTML> element (with or without retaining its
087: * children), the resulting document event stream will no longer be
088: * well-formed.
089: *
090: * @author Andy Clark
091: *
092: * @version $Id: ElementRemover.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
093: */
094: public class ElementRemover extends DefaultFilter {
095:
096: //
097: // Constants
098: //
099:
100: /** A "null" object. */
101: protected static final Object NULL = new Object();
102:
103: //
104: // Data
105: //
106:
107: // information
108:
109: /** Accepted elements. */
110: protected Hashtable fAcceptedElements = new Hashtable();
111:
112: /** Removed elements. */
113: protected Hashtable fRemovedElements = new Hashtable();
114:
115: // state
116:
117: /** The element depth. */
118: protected int fElementDepth;
119:
120: /** The element depth at element removal. */
121: protected int fRemovalElementDepth;
122:
123: //
124: // Public methods
125: //
126:
127: /**
128: * Specifies that the given element should be accepted and, optionally,
129: * which attributes of that element should be kept.
130: *
131: * @param element The element to accept.
132: * @param attributes The list of attributes to be kept or null if no
133: * attributes should be kept for this element.
134: *
135: * see #removeElement
136: */
137: public void acceptElement(String element, String[] attributes) {
138: Object key = element.toLowerCase();
139: Object value = NULL;
140: if (attributes != null) {
141: String[] newarray = new String[attributes.length];
142: for (int i = 0; i < attributes.length; i++) {
143: newarray[i] = attributes[i].toLowerCase();
144: }
145: value = attributes;
146: }
147: fAcceptedElements.put(key, value);
148: } // acceptElement(String,String[])
149:
150: /**
151: * Specifies that the given element should be completely removed. If an
152: * element is encountered during processing that is on the remove list,
153: * the element's start and end tags as well as all of content contained
154: * within the element will be removed from the processing stream.
155: *
156: * @param element The element to completely remove.
157: */
158: public void removeElement(String element) {
159: Object key = element.toLowerCase();
160: Object value = NULL;
161: fRemovedElements.put(key, value);
162: } // removeElement(String)
163:
164: //
165: // XMLDocumentHandler methods
166: //
167:
168: // since Xerces-J 2.2.0
169:
170: /** Start document. */
171: public void startDocument(XMLLocator locator, String encoding,
172: NamespaceContext nscontext, Augmentations augs)
173: throws XNIException {
174: fElementDepth = 0;
175: fRemovalElementDepth = Integer.MAX_VALUE;
176: super .startDocument(locator, encoding, nscontext, augs);
177: } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
178:
179: // old methods
180:
181: /** Start document. */
182: public void startDocument(XMLLocator locator, String encoding,
183: Augmentations augs) throws XNIException {
184: startDocument(locator, encoding, null, augs);
185: } // startDocument(XMLLocator,String,Augmentations)
186:
187: /** Start prefix mapping. */
188: public void startPrefixMapping(String prefix, String uri,
189: Augmentations augs) throws XNIException {
190: if (fElementDepth <= fRemovalElementDepth) {
191: super .startPrefixMapping(prefix, uri, augs);
192: }
193: } // startPrefixMapping(String,String,Augmentations)
194:
195: /** Start element. */
196: public void startElement(QName element, XMLAttributes attributes,
197: Augmentations augs) throws XNIException {
198: if (fElementDepth <= fRemovalElementDepth
199: && handleOpenTag(element, attributes)) {
200: super .startElement(element, attributes, augs);
201: }
202: fElementDepth++;
203: } // startElement(QName,XMLAttributes,Augmentations)
204:
205: /** Empty element. */
206: public void emptyElement(QName element, XMLAttributes attributes,
207: Augmentations augs) throws XNIException {
208: if (fElementDepth <= fRemovalElementDepth
209: && handleOpenTag(element, attributes)) {
210: super .emptyElement(element, attributes, augs);
211: }
212: } // emptyElement(QName,XMLAttributes,Augmentations)
213:
214: /** Comment. */
215: public void comment(XMLString text, Augmentations augs)
216: throws XNIException {
217: if (fElementDepth <= fRemovalElementDepth) {
218: super .comment(text, augs);
219: }
220: } // comment(XMLString,Augmentations)
221:
222: /** Processing instruction. */
223: public void processingInstruction(String target, XMLString data,
224: Augmentations augs) throws XNIException {
225: if (fElementDepth <= fRemovalElementDepth) {
226: super .processingInstruction(target, data, augs);
227: }
228: } // processingInstruction(String,XMLString,Augmentations)
229:
230: /** Characters. */
231: public void characters(XMLString text, Augmentations augs)
232: throws XNIException {
233: if (fElementDepth <= fRemovalElementDepth) {
234: super .characters(text, augs);
235: }
236: } // characters(XMLString,Augmentations)
237:
238: /** Ignorable whitespace. */
239: public void ignorableWhitespace(XMLString text, Augmentations augs)
240: throws XNIException {
241: if (fElementDepth <= fRemovalElementDepth) {
242: super .ignorableWhitespace(text, augs);
243: }
244: } // ignorableWhitespace(XMLString,Augmentations)
245:
246: /** Start general entity. */
247: public void startGeneralEntity(String name,
248: XMLResourceIdentifier id, String encoding,
249: Augmentations augs) throws XNIException {
250: if (fElementDepth <= fRemovalElementDepth) {
251: super .startGeneralEntity(name, id, encoding, augs);
252: }
253: } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
254:
255: /** Text declaration. */
256: public void textDecl(String version, String encoding,
257: Augmentations augs) throws XNIException {
258: if (fElementDepth <= fRemovalElementDepth) {
259: super .textDecl(version, encoding, augs);
260: }
261: } // textDecl(String,String,Augmentations)
262:
263: /** End general entity. */
264: public void endGeneralEntity(String name, Augmentations augs)
265: throws XNIException {
266: if (fElementDepth <= fRemovalElementDepth) {
267: super .endGeneralEntity(name, augs);
268: }
269: } // endGeneralEntity(String,Augmentations)
270:
271: /** Start CDATA section. */
272: public void startCDATA(Augmentations augs) throws XNIException {
273: if (fElementDepth <= fRemovalElementDepth) {
274: super .startCDATA(augs);
275: }
276: } // startCDATA(Augmentations)
277:
278: /** End CDATA section. */
279: public void endCDATA(Augmentations augs) throws XNIException {
280: if (fElementDepth <= fRemovalElementDepth) {
281: super .endCDATA(augs);
282: }
283: } // endCDATA(Augmentations)
284:
285: /** End element. */
286: public void endElement(QName element, Augmentations augs)
287: throws XNIException {
288: if (fElementDepth <= fRemovalElementDepth
289: && elementAccepted(element.rawname)) {
290: super .endElement(element, augs);
291: }
292: fElementDepth--;
293: if (fElementDepth == fRemovalElementDepth) {
294: fRemovalElementDepth = Integer.MAX_VALUE;
295: }
296: } // endElement(QName,Augmentations)
297:
298: /** End prefix mapping. */
299: public void endPrefixMapping(String prefix, Augmentations augs)
300: throws XNIException {
301: if (fElementDepth <= fRemovalElementDepth) {
302: super .endPrefixMapping(prefix, augs);
303: }
304: } // endPrefixMapping(String,Augmentations)
305:
306: //
307: // Protected methods
308: //
309:
310: /** Returns true if the specified element is accepted. */
311: protected boolean elementAccepted(String element) {
312: Object key = element.toLowerCase();
313: return fAcceptedElements.containsKey(key);
314: } // elementAccepted(String):boolean
315:
316: /** Returns true if the specified element should be removed. */
317: protected boolean elementRemoved(String element) {
318: Object key = element.toLowerCase();
319: return fRemovedElements.containsKey(key);
320: } // elementRemoved(String):boolean
321:
322: /** Handles an open tag. */
323: protected boolean handleOpenTag(QName element,
324: XMLAttributes attributes) {
325: if (elementAccepted(element.rawname)) {
326: Object key = element.rawname.toLowerCase();
327: Object value = fAcceptedElements.get(key);
328: if (value != NULL) {
329: String[] anames = (String[]) value;
330: int attributeCount = attributes.getLength();
331: LOOP: for (int i = 0; i < attributeCount; i++) {
332: String aname = attributes.getQName(i).toLowerCase();
333: for (int j = 0; j < anames.length; j++) {
334: if (anames[j].equals(aname)) {
335: continue LOOP;
336: }
337: }
338: attributes.removeAttributeAt(i--);
339: attributeCount--;
340: }
341: } else {
342: attributes.removeAllAttributes();
343: }
344: return true;
345: } else if (elementRemoved(element.rawname)) {
346: fRemovalElementDepth = fElementDepth;
347: }
348: return false;
349: } // handleOpenTag(QName,XMLAttributes):boolean
350:
351: } // class DefaultFilter
|