001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
import java.util.HashMap;
import java.util.Locale;
import java.util.StringTokenizer;
042:
043: /**
044: * <p>
045: * Default HTML tag info provider. Here the basic set of HTML tags is defined, including
046: * depricated tags and some Microsoft specific tags. Rules for tag balancing are similar
047: * to that used in most web-browsers.
048: * </p>
049: *
050: * Created by: Vladimir Nikic<br/>
051: * Date: November, 2006.
052: */
053: public class HtmlTagProvider extends HashMap implements
054: ITagInfoProvider {
055:
056: /**
057: *
058: */
059: private static final long serialVersionUID = -148180231621348450L;
060: // singleton instance, used if no other TagInfoProvider is specified
061: private static HtmlTagProvider _instance;
062:
063: /**
064: * Returns singleton instance of this class.
065: */
066: public static synchronized HtmlTagProvider getInstance() {
067: if (_instance == null) {
068: _instance = new HtmlTagProvider();
069: }
070:
071: return _instance;
072: }
073:
074: /**
075: * Default constructor - creates tags and rules for balancing.
076: */
077: public HtmlTagProvider() {
078: defineTags();
079: }
080:
081: /**
082: * Shortcut to creating TagInfo instance and storing it to the map.
083: * @param name
084: * @param contentType
085: * @param belongsTo
086: * @param dependancies
087: */
088: protected void addTag(String name, String contentType,
089: int belongsTo, String dependancies) {
090: this .put(name.toLowerCase(), new TagInfo(name, contentType,
091: belongsTo, false, false, false, dependancies));
092: }
093:
094: /**
095: * Definition of all HTML tags together with rules for tag balancing.
096: */
097: protected void defineTags() {
098: // Structure
099: addTag("div", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
100: addTag("span", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
101:
102: // Meta Information
103: addTag("meta", TagInfo.CONTENT_NONE, TagInfo.HEAD, null);
104: addTag("link", TagInfo.CONTENT_NONE, TagInfo.HEAD, null);
105: addTag("title", TagInfo.CONTENT_TEXT, TagInfo.HEAD, null);
106: addTag("style", TagInfo.CONTENT_ALL, TagInfo.HEAD, null);
107: addTag("bgsound", TagInfo.CONTENT_NONE, TagInfo.HEAD, null);
108:
109: // Text
110: addTag("h1", TagInfo.CONTENT_ALL, TagInfo.BODY,
111: "h1,h2,h3,h4,h5,h6");
112: addTag("h2", TagInfo.CONTENT_ALL, TagInfo.BODY,
113: "h1,h2,h3,h4,h5,h6");
114: addTag("h3", TagInfo.CONTENT_ALL, TagInfo.BODY,
115: "h1,h2,h3,h4,h5,h6");
116: addTag("h4", TagInfo.CONTENT_ALL, TagInfo.BODY,
117: "h1,h2,h3,h4,h5,h6");
118: addTag("h5", TagInfo.CONTENT_ALL, TagInfo.BODY,
119: "h1,h2,h3,h4,h5,h6");
120: addTag("h6", TagInfo.CONTENT_ALL, TagInfo.BODY,
121: "h1,h2,h3,h4,h5,h6");
122: addTag("p", TagInfo.CONTENT_ALL, TagInfo.BODY, "p");
123: addTag("strong", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
124: addTag("em", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
125: addTag("abbr", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
126: addTag("acronym", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
127: addTag("address", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
128: addTag("bdo", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
129: addTag("blockquote", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
130: addTag("cite", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
131: addTag("q", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
132: addTag("code", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
133: addTag("ins", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
134: addTag("del", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
135: addTag("dfn", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
136: addTag("kbd", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
137: addTag("pre", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
138: addTag("samp", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
139: addTag("listing", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
140: addTag("var", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
141: addTag("br", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
142: addTag("wbr", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
143: addTag("nobr", TagInfo.CONTENT_ALL, TagInfo.BODY, "nobr");
144: addTag("xmp", TagInfo.CONTENT_TEXT, TagInfo.BODY, null);
145:
146: // Links
147: addTag("a", TagInfo.CONTENT_ALL, TagInfo.BODY, "a");
148: addTag("base", TagInfo.CONTENT_NONE, TagInfo.HEAD, null);
149:
150: // Images and Objects
151: addTag("img", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
152: addTag("area", TagInfo.CONTENT_NONE, TagInfo.BODY, "!map,area");
153: addTag("map", TagInfo.CONTENT_ALL, TagInfo.BODY, "map");
154: addTag("object", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
155: addTag("param", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
156: addTag("applet", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
157: addTag("xml", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
158:
159: // Lists
160: addTag("ul", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
161: addTag("ol", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
162: addTag("li", TagInfo.CONTENT_ALL, TagInfo.BODY, "li");
163: addTag("dl", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
164: addTag("dt", TagInfo.CONTENT_ALL, TagInfo.BODY, "dt,dd");
165: addTag("dd", TagInfo.CONTENT_ALL, TagInfo.BODY, "dt,dd");
166: addTag("menu", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
167: addTag("dir", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
168:
169: // Tables
170: addTag(
171: "table",
172: TagInfo.CONTENT_ALL,
173: TagInfo.BODY,
174: "#tr,#tbody,#thead,#tfoot,#colgroup,#caption,#tr,tr,thead,tbody,tfoot,caption,colgroup,table");
175: addTag("tr", TagInfo.CONTENT_ALL, TagInfo.BODY,
176: "!table,+tbody,^thead,^tfoot,#td,#th,tr,caption,colgroup");
177: addTag("td", TagInfo.CONTENT_ALL, TagInfo.BODY,
178: "!table,+tr,td,th,caption,colgroup");
179: addTag("th", TagInfo.CONTENT_ALL, TagInfo.BODY,
180: "!table,+tr,td,th,caption,colgroup");
181: addTag("tbody", TagInfo.CONTENT_ALL, TagInfo.BODY,
182: "!table,#tr,td,th,tr,tbody,thead,tfoot,caption,colgroup");
183: addTag("thead", TagInfo.CONTENT_ALL, TagInfo.BODY,
184: "!table,#tr,td,th,tr,tbody,thead,tfoot,caption,colgroup");
185: addTag("tfoot", TagInfo.CONTENT_ALL, TagInfo.BODY,
186: "!table,#tr,td,th,tr,tbody,thead,tfoot,caption,colgroup");
187: addTag("col", TagInfo.CONTENT_NONE, TagInfo.BODY, "!colgroup");
188: addTag("colgroup", TagInfo.CONTENT_ALL, TagInfo.BODY,
189: "!table,#col,td,th,tr,tbody,thead,tfoot,caption,colgroup");
190: addTag("caption", TagInfo.CONTENT_ALL, TagInfo.BODY,
191: "!table,td,th,tr,tbody,thead,tfoot,caption,colgroup");
192:
193: // Forms
194: addTag("form", TagInfo.CONTENT_ALL, TagInfo.BODY,
195: "-form,option,optgroup,textarea,select,fieldset");
196: addTag("input", TagInfo.CONTENT_NONE, TagInfo.BODY,
197: "select,optgroup,option");
198: addTag("textarea", TagInfo.CONTENT_ALL, TagInfo.BODY,
199: "select,optgroup,option");
200: addTag("select", TagInfo.CONTENT_ALL, TagInfo.BODY,
201: "#option,#optgroup,option,optgroup,select");
202: addTag("option", TagInfo.CONTENT_TEXT, TagInfo.BODY,
203: "!select,option");
204: addTag("optgroup", TagInfo.CONTENT_ALL, TagInfo.BODY,
205: "!select,#option,optgroup");
206: addTag("button", TagInfo.CONTENT_ALL, TagInfo.BODY,
207: "select,optgroup,option");
208: addTag("label", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
209: addTag("fieldset", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
210: addTag("isindex", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
211:
212: // Scripting
213: addTag("script", TagInfo.CONTENT_ALL, TagInfo.HEAD_AND_BODY,
214: null);
215: addTag("noscript", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
216:
217: // Presentational
218: addTag("b", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
219: addTag("i", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
220: addTag("u", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
221: addTag("tt", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
222: addTag("sub", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
223: addTag("sup", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
224: addTag("big", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
225: addTag("small", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
226: addTag("strike", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
227: addTag("blink", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
228: addTag("marquee", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
229: addTag("s", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
230: addTag("hr", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
231: addTag("font", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
232: addTag("basefont", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
233: addTag("center", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
234:
235: addTag("comment", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
236: addTag("server", TagInfo.CONTENT_ALL, TagInfo.BODY, null);
237: addTag("iframe", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
238: addTag("embed", TagInfo.CONTENT_NONE, TagInfo.BODY, null);
239:
240: getTagInfo("title").setUnique(true);
241: getTagInfo("form").setIgnorePermitted(true);
242: getTagInfo("select").setIgnorePermitted(true);
243: getTagInfo("option").setIgnorePermitted(true);
244: getTagInfo("optgroup").setIgnorePermitted(true);
245:
246: String commonTags = "div,p,address,h1,h2,h3,h4,h5,h6,blockquote,pre,listing,ul,ol,li,dl,menu,dir,table,form,fieldset,isindex,marquee,center,embed,param,hr";
247:
248: addDependancy("p", commonTags);
249: addDependancy("address", commonTags);
250: addDependancy("label", commonTags);
251: addDependancy("abbr", commonTags);
252: addDependancy("acronym", commonTags);
253: addDependancy("dfn", commonTags);
254: addDependancy("kbd", commonTags);
255: addDependancy("samp", commonTags);
256: addDependancy("var", commonTags);
257: addDependancy("cite", commonTags);
258: addDependancy("code", commonTags);
259: addDependancy("param", commonTags);
260: addDependancy("xml", commonTags);
261:
262: addDependancy("&a", commonTags);
263: addDependancy("&bdo", commonTags);
264: addDependancy("&strong", commonTags);
265: addDependancy("&em", commonTags);
266: addDependancy("&q", commonTags);
267: addDependancy("&b", commonTags);
268: addDependancy("&i", commonTags);
269: addDependancy("&u", commonTags);
270: addDependancy("&tt", commonTags);
271: addDependancy("&sub", commonTags);
272: addDependancy("&sup", commonTags);
273: addDependancy("&big", commonTags);
274: addDependancy("&small", commonTags);
275: addDependancy("&strike", commonTags);
276: addDependancy("&s", commonTags);
277: addDependancy("&font", commonTags);
278:
279: getTagInfo("applet").setDeprecated(true);
280: getTagInfo("basefont").setDeprecated(true);
281: getTagInfo("center").setDeprecated(true);
282: getTagInfo("dir").setDeprecated(true);
283: getTagInfo("font").setDeprecated(true);
284: getTagInfo("isindex").setDeprecated(true);
285: getTagInfo("menu").setDeprecated(true);
286: getTagInfo("s").setDeprecated(true);
287: getTagInfo("strike").setDeprecated(true);
288: getTagInfo("u").setDeprecated(true);
289: }
290:
291: protected void addDependancy(String tagName, String tagList) {
292: if (tagList != null) {
293: StringTokenizer tokenizer = new StringTokenizer(tagList,
294: ",.");
295: while (tokenizer.hasMoreTokens()) {
296: TagInfo curr = getTagInfo(tokenizer.nextToken().trim());
297: curr.addDependancy(tagName);
298: }
299: }
300: }
301:
302: /**
303: * Implementation of the interface method.
304: * @param tagName
305: * @return TagInfo instance from the map, for the specified tag name.
306: */
307: public TagInfo getTagInfo(String tagName) {
308: if (tagName != null) {
309: return (TagInfo) get(tagName.toLowerCase());
310: }
311:
312: return null;
313: }
314:
315: }
|