001: /*
002: * Copyright 1999-2002,2004 The Apache Software Foundation.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: // Aug 21, 2000:
018: // Fixed bug in isElement and made HTMLdtd public.
019: // Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com>
020:
021: package org.jasig.portal.serialize;
022:
023: import org.apache.xerces.dom.DOMMessageFormatter;
024:
025: import java.io.InputStream;
026: import java.io.InputStreamReader;
027: import java.io.BufferedReader;
028: import java.util.Collections;
029: import java.util.HashMap;
030: import java.util.Locale;
031: import java.util.Map;
032:
033: /**
034: * Utility class for accessing information specific to HTML documents.
035: * The HTML DTD is expressed as three utility function groups. Two methods
036: * allow for checking whether an element requires an open tag on printing
037: * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
038: * <P>
039: * Two other methods translate character references from name to value and
040: * from value to name. A small entities resource is loaded into memory the
041: * first time any of these methods is called for fast and efficient access.
042: *
043: *
044: * @version $Revision: 36559 $ $Date: 2006-04-28 11:38:13 -0700 (Fri, 28 Apr 2006) $
045: * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
046: */
047: public final class HTMLdtd {
048:
049: /**
050: * Public identifier for HTML 4.01 (Strict) document type.
051: */
052: public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
053:
054: /**
055: * System identifier for HTML 4.01 (Strict) document type.
056: */
057: public static final String HTMLSystemId = "http://www.w3.org/TR/html4/strict.dtd";
058:
059: /**
060: * Public identifier for XHTML 1.0 (Strict) document type.
061: */
062: public static final String XHTMLPublicId = "-//W3C//DTD XHTML 1.0 Strict//EN";
063:
064: /**
065: * System identifier for XHTML 1.0 (Strict) document type.
066: */
067: public static final String XHTMLSystemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
068:
069: /**
070: * Table of reverse character reference mapping. Character codes are held
071: * as single-character strings, mapped to their reference name.
072: */
073: private static String[] _entity;
074:
075: /**
076: * Table of entity name to value mapping. Entities are held as strings,
077: * character references as <TT>Character</TT> objects.
078: */
079: private static Map _byName;
080:
081: private static Map _boolAttrs;
082:
083: /**
084: * Holds element definitions.
085: */
086: private static Map _elemDefs;
087:
088: /**
089: * Locates the HTML entities file that is loaded upon initialization.
090: * This file is a resource loaded with the default class loader.
091: */
092: private static final String ENTITIES_RESOURCE = "HTMLEntities.res";
093:
094: /**
095: * Only opening tag should be printed.
096: */
097: private static final int ONLY_OPENING = 0x0001;
098:
099: /**
100: * Element contains element content only.
101: */
102: private static final int ELEM_CONTENT = 0x0002;
103:
104: /**
105: * Element preserve spaces.
106: */
107: private static final int PRESERVE = 0x0004;
108:
109: /**
110: * Optional closing tag.
111: */
112: private static final int OPT_CLOSING = 0x0008;
113:
114: /**
115: * Element is empty (also means only opening tag)
116: */
117: private static final int EMPTY = 0x0010 | ONLY_OPENING;
118:
119: /**
120: * Allowed to appear in head.
121: */
122: private static final int ALLOWED_HEAD = 0x0020;
123:
124: /**
125: * When opened, closes P.
126: */
127: private static final int CLOSE_P = 0x0040;
128:
129: /**
130: * When opened, closes DD or DT.
131: */
132: private static final int CLOSE_DD_DT = 0x0080;
133:
134: /**
135: * When opened, closes itself.
136: */
137: private static final int CLOSE_SELF = 0x0100;
138:
139: /**
140: * When opened, closes another table section.
141: */
142: private static final int CLOSE_TABLE = 0x0200;
143:
144: /**
145: * When opened, closes TH or TD.
146: */
147: private static final int CLOSE_TH_TD = 0x04000;
148:
149: /**
150: * Returns true if element is declared to be empty. HTML elements are
151: * defines as empty in the DTD, not by the document syntax.
152: *
153: * @param tagName The element tag name (upper case)
154: * @return True if element is empty
155: */
156: public static boolean isEmptyTag(String tagName) {
157: return isElement(tagName, EMPTY);
158: }
159:
160: /**
161: * Returns true if element is declared to have element content.
162: * Whitespaces appearing inside element content will be ignored,
163: * other text will simply report an error.
164: *
165: * @param tagName The element tag name (upper case)
166: * @return True if element content
167: */
168: public static boolean isElementContent(String tagName) {
169: return isElement(tagName, ELEM_CONTENT);
170: }
171:
172: /**
173: * Returns true if element's textual contents preserves spaces.
174: * This only applies to PRE and TEXTAREA, all other HTML elements
175: * do not preserve space.
176: *
177: * @param tagName The element tag name (upper case)
178: * @return True if element's text content preserves spaces
179: */
180: public static boolean isPreserveSpace(String tagName) {
181: return isElement(tagName, PRESERVE);
182: }
183:
184: /**
185: * Returns true if element's closing tag is optional and need not
186: * exist. An error will not be reported for such elements if they
187: * are not closed. For example, <tt>LI</tt> is most often not closed.
188: *
189: * @param tagName The element tag name (upper case)
190: * @return True if closing tag implied
191: */
192: public static boolean isOptionalClosing(String tagName) {
193: return isElement(tagName, OPT_CLOSING);
194: }
195:
196: /**
197: * Returns true if element's closing tag is generally not printed.
198: * For example, <tt>LI</tt> should not print the closing tag.
199: *
200: * @param tagName The element tag name (upper case)
201: * @return True if only opening tag should be printed
202: */
203: public static boolean isOnlyOpening(String tagName) {
204: return isElement(tagName, ONLY_OPENING);
205: }
206:
207: /**
208: * Returns true if the opening of one element (<tt>tagName</tt>) implies
209: * the closing of another open element (<tt>openTag</tt>). For example,
210: * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
211: * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
212: *
213: * @param tagName The newly opened element
214: * @param openTag The already opened element
215: * @return True if closing tag closes opening tag
216: */
217: public static boolean isClosing(String tagName, String openTag) {
218: // Several elements are defined as closing the HEAD
219: if (openTag.equalsIgnoreCase("HEAD"))
220: return !isElement(tagName, ALLOWED_HEAD);
221: // P closes iteself
222: if (openTag.equalsIgnoreCase("P"))
223: return isElement(tagName, CLOSE_P);
224: // DT closes DD, DD closes DT
225: if (openTag.equalsIgnoreCase("DT")
226: || openTag.equalsIgnoreCase("DD"))
227: return isElement(tagName, CLOSE_DD_DT);
228: // LI and OPTION close themselves
229: if (openTag.equalsIgnoreCase("LI")
230: || openTag.equalsIgnoreCase("OPTION"))
231: return isElement(tagName, CLOSE_SELF);
232: // Each of these table sections closes all the others
233: if (openTag.equalsIgnoreCase("THEAD")
234: || openTag.equalsIgnoreCase("TFOOT")
235: || openTag.equalsIgnoreCase("TBODY")
236: || openTag.equalsIgnoreCase("TR")
237: || openTag.equalsIgnoreCase("COLGROUP"))
238: return isElement(tagName, CLOSE_TABLE);
239: // TD closes TH and TH closes TD
240: if (openTag.equalsIgnoreCase("TH")
241: || openTag.equalsIgnoreCase("TD"))
242: return isElement(tagName, CLOSE_TH_TD);
243: return false;
244: }
245:
246: /**
247: * Returns true if the specified attribute it a URI and should be
248: * escaped appropriately. In HTML URIs are escaped differently
249: * than normal attributes.
250: *
251: * @param tagName The element's tag name
252: * @param attrName The attribute's name
253: */
254: public static boolean isURI(String tagName, String attrName) {
255: // Stupid checks.
256: return (attrName.equalsIgnoreCase("href") || attrName
257: .equalsIgnoreCase("src"));
258: }
259:
260: /**
261: * Returns true if the specified attribute is a boolean and should be
262: * printed without the value. This applies to attributes that are true
263: * if they exist, such as selected (OPTION/INPUT).
264: *
265: * @param tagName The element's tag name
266: * @param attrName The attribute's name
267: */
268: public static boolean isBoolean(String tagName, String attrName) {
269: String[] attrNames;
270:
271: attrNames = (String[]) _boolAttrs.get(tagName
272: .toUpperCase(Locale.ENGLISH));
273: if (attrNames == null)
274: return false;
275: for (int i = 0; i < attrNames.length; ++i)
276: if (attrNames[i].equalsIgnoreCase(attrName))
277: return true;
278: return false;
279: }
280:
281: /**
282: * Returns the value of an HTML character reference by its name. If the
283: * reference is not found or was not defined as a character reference,
284: * returns EOF (-1).
285: *
286: * @param name Name of character reference
287: * @return Character code or EOF (-1)
288: */
289: public static int charFromName(String name) {
290: Object value;
291:
292: initialize();
293: value = _byName.get(name);
294: if (value != null && value instanceof Integer)
295: return ((Integer) value).intValue();
296: else
297: return -1;
298: }
299:
300: /**
301: * Returns the name of an HTML character reference based on its character
302: * value. Only valid for entities defined from character references. If no
303: * such character value was defined, return null.
304: *
305: * @param value Character value of entity
306: * @return Entity's name or null
307: */
308: public static String fromChar(int value) {
309: if (value > 0xffff)
310: return null;
311:
312: initialize();
313: if (value < _entity.length) {
314: return _entity[value];
315: } else {
316: return null;
317: }
318: }
319:
320: /**
321: * Initialize upon first access. Will load all the HTML character references
322: * into a list that is accessible by name or character value and is optimized
323: * for character substitution. This method may be called any number of times
324: * but will execute only once.
325: */
326: private static void initialize() {
327: InputStream is = null;
328: BufferedReader reader = null;
329: int index;
330: String name;
331: String value;
332: int code;
333: String line;
334:
335: // Make sure not to initialize twice.
336: if (_entity != null)
337: return;
338: try {
339: _byName = new HashMap();
340: _entity = new String[10000];
341: is = HTMLdtd.class.getResourceAsStream(ENTITIES_RESOURCE);
342: if (is == null) {
343: throw new RuntimeException(DOMMessageFormatter
344: .formatMessage(
345: DOMMessageFormatter.SERIALIZER_DOMAIN,
346: "ResourceNotFound",
347: new Object[] { ENTITIES_RESOURCE }));
348: }
349: reader = new BufferedReader(new InputStreamReader(is,
350: "ASCII"));
351: line = reader.readLine();
352: while (line != null) {
353: if (line.length() == 0 || line.charAt(0) == '#') {
354: line = reader.readLine();
355: continue;
356: }
357: index = line.indexOf(' ');
358: if (index > 1) {
359: name = line.substring(0, index);
360: ++index;
361: if (index < line.length()) {
362: value = line.substring(index);
363: index = value.indexOf(' ');
364: if (index > 0)
365: value = value.substring(0, index);
366: code = Integer.parseInt(value);
367: defineEntity(name, (char) code);
368: }
369: }
370: line = reader.readLine();
371: }
372: is.close();
373: } catch (Exception except) {
374: throw new RuntimeException(DOMMessageFormatter
375: .formatMessage(
376: DOMMessageFormatter.SERIALIZER_DOMAIN,
377: "ResourceNotLoaded", new Object[] {
378: ENTITIES_RESOURCE,
379: except.toString() }));
380: } finally {
381: if (is != null) {
382: try {
383: is.close();
384: } catch (Exception except) {
385: }
386: }
387: }
388: // save only the unmodifiable map to the member variable.
389: _byName = Collections.unmodifiableMap(_byName);
390: }
391:
392: /**
393: * Defines a new character reference. The reference's name and value are
394: * supplied. Nothing happens if the character reference is already defined.
395: * <P>
396: * Unlike internal entities, character references are a string to single
397: * character mapping. They are used to map non-ASCII characters both on
398: * parsing and printing, primarily for HTML documents. '<amp;' is an
399: * example of a character reference.
400: *
401: * @param name The entity's name
402: * @param value The entity's value
403: */
404: private static void defineEntity(String name, char value) {
405: int intValue = (int) value;
406: if (intValue < _entity.length) {
407: if (_entity[intValue] == null) {
408:
409: _entity[intValue] = name;
410: }
411: } else {
412: /*
413: * increase the size of array and put the new
414: * entity name at the appropriate index value.
415: */
416: final String newArray[] = _entity;
417: _entity = new String[intValue + 1];
418: for (int i = 0, n = newArray.length; i < n; i++) {
419: _entity[i] = newArray[i];
420: }
421: _entity[intValue] = name;
422: }
423: }
424:
425: private static void defineElement(String name, int flags) {
426: _elemDefs.put(name, new Integer(flags));
427: }
428:
429: private static void defineBoolean(String tagName, String attrName) {
430: defineBoolean(tagName, new String[] { attrName });
431: }
432:
433: private static void defineBoolean(String tagName, String[] attrNames) {
434: _boolAttrs.put(tagName, attrNames);
435: }
436:
437: private static boolean isElement(String name, int flag) {
438: Integer flags;
439:
440: flags = (Integer) _elemDefs.get(name
441: .toUpperCase(Locale.ENGLISH));
442: if (flags == null)
443: return false;
444: else
445: return ((flags.intValue() & flag) == flag);
446: }
447:
448: static {
449: _elemDefs = new HashMap();
450: defineElement("ADDRESS", CLOSE_P);
451: defineElement("AREA", EMPTY);
452: defineElement("BASE", EMPTY | ALLOWED_HEAD);
453: defineElement("BASEFONT", EMPTY);
454: defineElement("BLOCKQUOTE", CLOSE_P);
455: defineElement("BODY", OPT_CLOSING);
456: defineElement("BR", EMPTY);
457: defineElement("COL", EMPTY);
458: defineElement("COLGROUP", ELEM_CONTENT | OPT_CLOSING
459: | CLOSE_TABLE);
460: defineElement("DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT);
461: defineElement("DIV", CLOSE_P);
462: defineElement("DL", ELEM_CONTENT | CLOSE_P);
463: defineElement("DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT);
464: defineElement("FIELDSET", CLOSE_P);
465: defineElement("FORM", CLOSE_P);
466: defineElement("FRAME", EMPTY | OPT_CLOSING);
467: defineElement("H1", CLOSE_P);
468: defineElement("H2", CLOSE_P);
469: defineElement("H3", CLOSE_P);
470: defineElement("H4", CLOSE_P);
471: defineElement("H5", CLOSE_P);
472: defineElement("H6", CLOSE_P);
473: defineElement("HEAD", ELEM_CONTENT | OPT_CLOSING);
474: defineElement("HR", EMPTY | CLOSE_P);
475: defineElement("HTML", ELEM_CONTENT | OPT_CLOSING);
476: defineElement("IMG", EMPTY);
477: defineElement("INPUT", EMPTY);
478: defineElement("ISINDEX", EMPTY | ALLOWED_HEAD);
479: defineElement("LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF);
480: defineElement("LINK", EMPTY | ALLOWED_HEAD);
481: defineElement("MAP", ALLOWED_HEAD);
482: defineElement("META", EMPTY | ALLOWED_HEAD);
483: defineElement("OL", ELEM_CONTENT | CLOSE_P);
484: defineElement("OPTGROUP", ELEM_CONTENT);
485: defineElement("OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF);
486: defineElement("P", OPT_CLOSING | CLOSE_P | CLOSE_SELF);
487: defineElement("PARAM", EMPTY);
488: defineElement("PRE", PRESERVE | CLOSE_P);
489: defineElement("SCRIPT", ALLOWED_HEAD | PRESERVE);
490: defineElement("NOSCRIPT", ALLOWED_HEAD | PRESERVE);
491: defineElement("SELECT", ELEM_CONTENT);
492: defineElement("STYLE", ALLOWED_HEAD | PRESERVE);
493: defineElement("TABLE", ELEM_CONTENT | CLOSE_P);
494: defineElement("TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
495: defineElement("TD", OPT_CLOSING | CLOSE_TH_TD);
496: defineElement("TEXTAREA", PRESERVE);
497: defineElement("TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
498: defineElement("TH", OPT_CLOSING | CLOSE_TH_TD);
499: defineElement("THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
500: defineElement("TITLE", ALLOWED_HEAD);
501: defineElement("TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
502: defineElement("UL", ELEM_CONTENT | CLOSE_P);
503: _elemDefs = Collections.unmodifiableMap(_elemDefs);
504: ;
505:
506: _boolAttrs = new HashMap();
507: defineBoolean("AREA", "href");
508: defineBoolean("BUTTON", "disabled");
509: defineBoolean("DIR", "compact");
510: defineBoolean("DL", "compact");
511: defineBoolean("FRAME", "noresize");
512: defineBoolean("HR", "noshade");
513: defineBoolean("IMAGE", "ismap");
514: defineBoolean("INPUT", new String[] { "defaultchecked",
515: "checked", "readonly", "disabled" });
516: defineBoolean("LINK", "link");
517: defineBoolean("MENU", "compact");
518: defineBoolean("OBJECT", "declare");
519: defineBoolean("OL", "compact");
520: defineBoolean("OPTGROUP", "disabled");
521: defineBoolean("OPTION", new String[] { "default-selected",
522: "selected", "disabled" });
523: defineBoolean("SCRIPT", "defer");
524: defineBoolean("SELECT", new String[] { "multiple", "disabled" });
525: defineBoolean("STYLE", "disabled");
526: defineBoolean("TD", "nowrap");
527: defineBoolean("TH", "nowrap");
528: defineBoolean("TEXTAREA",
529: new String[] { "disabled", "readonly" });
530: defineBoolean("UL", "compact");
531: _boolAttrs = Collections.unmodifiableMap(_boolAttrs);
532:
533: initialize();
534: }
535:
536: }
|