001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import java.util.*;
041:
042: /**
043: * <p>
044: * Class contains information about single HTML tag.<br/>
045: * It also contains rules to for tag balancing. For each tag, list of dependant
046: * tags may be defined. In order to more easely describe those rules, several
047: * prefixed are introduced.
048: * </p>
049: * <p>
050: * For each tag, list of dependant tags may be specified using following prefixes:
051: * <ul>
052: * <li>
053: * <h3>!</h3> fatal tag - required outer tag - the tag will be ignored during
054: * parsing (will be skipped) if this fatal tag is missing. For example, most web
055: * browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag.
056: * </li>
057: * <li>
058: * <h3>+</h3> required enclosing tag - if there is no such, it is implicitely
059: * created. For example if TD is out of TR - open TR is created before.
060: * </li>
061: * <li>
062: * <h3>-</h3> permitted tag - it is not allowed to occure inside - for example
063: * FORM cannot be inside other FORM and it will be ignored during cleanup.
064: * </li>
065: * <li>
066: * <h3>#</h3> allowed children tags - for example TR allowes TD and TH. If there
067: * are some dependant allowed tags defined then cleaner ignores other tags, treating
068: * them as unallowed, unless they are in some other relationship with this tag.
069: * </li>
070: * <li>
071: * <h3>^</h3> higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
072: * </li>
073: * <li>
074: * <h3>&</h3> tags that must be closed and copied - for example, in
075: * <code><a href="#"><div>....</code> tag A must be closed before DIV but
076: * copied again inside DIV.
077: * </li>
078: * </ul>
079: * </p>
080: *
081: * <p>
082: * Tag TR for instance (table row) may define the following dependancies:
083: * <code>!table,+tbody,^thead,^tfoot,#td,#th,tr,caption,colgroup</code>
084: * meaning the following: <br>
085: * <li>TR must be in context of TABLE, otherwise it will be ignored,</li>
086: * <li>TR may can be directly inside TBODY, TFOOT and THEAD, otherwise TBODY will be
087: * implicitely created in front of it.</li>
088: * <li>TR can contain TD and TD, all other tags and content will be pushed out of current
089: * limiting context, in the case of html tables, in front of enclosing TABLE tag.</li>
090: * <li>if previous open tag is one of TR, CAPTION or COLGROUP, it will be implicitely closed.</li>
091: * </p>
092: *
093: * Created by Vladimir Nikic.<br/>
094: * Date: November, 2006
095: */
096: public class TagInfo {
097:
098: static final int HEAD_AND_BODY = 0;
099: static final int HEAD = 1;
100: static final int BODY = 2;
101:
102: static String CONTENT_ALL = "ALL";
103: static String CONTENT_NONE = "NONE";
104: static String CONTENT_TEXT = "TEXT";
105:
106: private String name;
107: private String contentType;
108: private Set mustCloseTags = new HashSet();
109: private Set higherTags = new HashSet();
110: private Set childTags = new HashSet();
111: private Set permittedTags = new HashSet();
112: private Set copyTags = new HashSet();
113: private int belongsTo = BODY;
114: private String requiredParent = null;
115: private String fatalTag = null;
116: private boolean deprecated = false;
117: private boolean unique = false;
118: private boolean ignorePermitted = false;
119:
120: public TagInfo(String name, String contentType, int belongsTo,
121: boolean depricated, boolean unique,
122: boolean ignorePermitted, String dependancies) {
123: this .name = name;
124: this .contentType = contentType;
125: this .belongsTo = belongsTo;
126: this .deprecated = depricated;
127: this .unique = unique;
128: this .ignorePermitted = ignorePermitted;
129:
130: // defines dependant tags
131: if (dependancies != null) {
132: StringTokenizer tokenizer = new StringTokenizer(
133: dependancies, ",.;| ");
134: while (tokenizer.hasMoreTokens()) {
135: String currTag = tokenizer.nextToken().toLowerCase();
136: addDependancy(currTag);
137: }
138: }
139: }
140:
141: public void addDependancy(String dependantTagName) {
142: if (dependantTagName.startsWith("!")) {
143: String tagName = dependantTagName.substring(1);
144: this .fatalTag = tagName;
145: this .higherTags.add(tagName);
146: } else if (dependantTagName.startsWith("+")) {
147: String tagName = dependantTagName.substring(1);
148: this .requiredParent = dependantTagName.substring(1);
149: this .higherTags.add(tagName);
150: } else if (dependantTagName.startsWith("-")) {
151: this .permittedTags.add(dependantTagName.substring(1));
152: } else if (dependantTagName.startsWith("#")) {
153: this .childTags.add(dependantTagName.substring(1));
154: } else if (dependantTagName.startsWith("^")) {
155: this .higherTags.add(dependantTagName.substring(1));
156: } else if (dependantTagName.startsWith("&")) {
157: String tagName = dependantTagName.substring(1);
158: this .copyTags.add(tagName);
159: this .mustCloseTags.add(tagName);
160: } else if (!"".equals(dependantTagName.trim())) {
161: this .mustCloseTags.add(dependantTagName);
162: }
163: }
164:
165: // getters and setters
166:
167: public String getName() {
168: return name;
169: }
170:
171: public void setName(String name) {
172: this .name = name;
173: }
174:
175: public String getContentType() {
176: return contentType;
177: }
178:
179: public void setContentType(String contentType) {
180: this .contentType = contentType;
181: }
182:
183: public Set getMustCloseTags() {
184: return mustCloseTags;
185: }
186:
187: public void setMustCloseTags(Set mustCloseTags) {
188: this .mustCloseTags = mustCloseTags;
189: }
190:
191: public Set getHigherTags() {
192: return higherTags;
193: }
194:
195: public void setHigherTags(Set higherTags) {
196: this .higherTags = higherTags;
197: }
198:
199: public Set getChildTags() {
200: return childTags;
201: }
202:
203: public void setChildTags(Set childTags) {
204: this .childTags = childTags;
205: }
206:
207: public Set getPermittedTags() {
208: return permittedTags;
209: }
210:
211: public void setPermittedTags(Set permittedTags) {
212: this .permittedTags = permittedTags;
213: }
214:
215: public Set getCopyTags() {
216: return copyTags;
217: }
218:
219: public void setCopyTags(Set copyTags) {
220: this .copyTags = copyTags;
221: }
222:
223: public String getRequiredParent() {
224: return requiredParent;
225: }
226:
227: public void setRequiredParent(String requiredParent) {
228: this .requiredParent = requiredParent;
229: }
230:
231: public int getBelongsTo() {
232: return belongsTo;
233: }
234:
235: public void setBelongsTo(int belongsTo) {
236: this .belongsTo = belongsTo;
237: }
238:
239: public String getFatalTag() {
240: return fatalTag;
241: }
242:
243: public void setFatalTag(String fatalTag) {
244: this .fatalTag = fatalTag;
245: }
246:
247: public boolean isDeprecated() {
248: return deprecated;
249: }
250:
251: public void setDeprecated(boolean deprecated) {
252: this .deprecated = deprecated;
253: }
254:
255: public boolean isUnique() {
256: return unique;
257: }
258:
259: public void setUnique(boolean unique) {
260: this .unique = unique;
261: }
262:
263: public boolean isIgnorePermitted() {
264: return ignorePermitted;
265: }
266:
267: public void setIgnorePermitted(boolean ignorePermitted) {
268: this .ignorePermitted = ignorePermitted;
269: }
270:
271: // other functionality
272:
273: boolean allowsBody() {
274: return !CONTENT_NONE.equals(contentType);
275: }
276:
277: boolean isHigher(String tagName) {
278: return higherTags.contains(tagName);
279: }
280:
281: boolean isCopy(String tagName) {
282: return copyTags.contains(tagName);
283: }
284:
285: boolean hasCopyTags() {
286: return !copyTags.isEmpty();
287: }
288:
289: boolean hasPermittedTags() {
290: return !permittedTags.isEmpty();
291: }
292:
293: boolean isHeadTag() {
294: return belongsTo == HEAD;
295: }
296:
297: boolean isHeadAndBodyTag() {
298: return belongsTo == HEAD || belongsTo == HEAD_AND_BODY;
299: }
300:
301: boolean isMustCloseTag(TagInfo tagInfo) {
302: if (tagInfo != null) {
303: return mustCloseTags.contains(tagInfo.getName())
304: || tagInfo.contentType == CONTENT_TEXT;
305: }
306:
307: return false;
308: }
309:
310: boolean allowsItem(BaseToken token) {
311: if (contentType != CONTENT_NONE && token instanceof TagToken) {
312: TagToken tagToken = (TagToken) token;
313: String tagName = tagToken.getName();
314: if ("script".equals(tagName)) {
315: return true;
316: }
317: }
318:
319: if (contentType == CONTENT_ALL) {
320: if (!childTags.isEmpty()) {
321: return token instanceof TagToken ? childTags
322: .contains(((TagToken) token).getName()) : false;
323: } else if (!permittedTags.isEmpty()) {
324: return token instanceof TagToken ? !permittedTags
325: .contains(((TagToken) token).getName()) : true;
326: } else {
327: return true;
328: }
329: } else if (contentType == CONTENT_TEXT) {
330: return !(token instanceof TagToken);
331: }
332:
333: return false;
334: }
335:
336: boolean allowsAnything() {
337: return contentType == CONTENT_ALL && childTags.size() == 0;
338: }
339:
340: }
|