001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: // Aug 21, 2000:
059: // Fixed bug in isElement and made HTMLdtd public.
// Contributed by Eric SCHAEFFER <eschaeffer@posterconseil.com>
061:
062: package org.apache.xml.serialize;
063:
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Hashtable;
import java.util.Locale;
068:
/**
 * Utility class for accessing information specific to HTML documents.
 * The HTML DTD is expressed as three groups of static utility methods:
 * <UL>
 * <LI>element queries such as {@link #isEmptyTag}, {@link #isOnlyOpening},
 *     {@link #isOptionalClosing} and {@link #isClosing}, answered from a
 *     built-in table of per-element flags;
 * <LI>attribute queries, {@link #isURI} and {@link #isBoolean};
 * <LI>character reference translation from name to value
 *     ({@link #charFromName}) and from value to name ({@link #fromChar}).
 * </UL>
 * <P>
 * The element and attribute tables are built, and the entities resource is
 * loaded into memory, when this class is initialized. Tag names are matched
 * without regard to case.
 *
 *
 * @version $Revision: 1.12 $ $Date: 2001/04/18 06:02:49 $
 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
 */
public final class HTMLdtd {

    /**
     * Public identifier for HTML document type.
     */
    public static final String HTMLPublicId = "-//W3C//DTD HTML 4.0//EN";

    /**
     * System identifier for HTML document type.
     */
    // NOTE(review): this value is identical to XHTMLSystemId and names the
    // XHTML strict DTD - confirm whether an HTML 4 DTD URI was intended. The
    // value is left untouched because it is a public compile-time constant.
    public static final String HTMLSystemId = "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";

    /**
     * Public identifier for XHTML document type.
     */
    public static final String XHTMLPublicId = "-//W3C//DTD XHTML 1.0 Strict//EN";

    /**
     * System identifier for XHTML document type.
     */
    public static final String XHTMLSystemId = "http://www.w3.org/TR/WD-html-in-xml/DTD/xhtml1-strict.dtd";

    /**
     * Table of reverse character reference mapping. Keys are character codes
     * held as <TT>Integer</TT> objects, values are the reference names.
     */
    private static Hashtable _byChar;

    /**
     * Table of character reference name to value mapping. Keys are the
     * reference names, values are character codes held as <TT>Integer</TT>
     * objects.
     */
    private static Hashtable _byName;

    /**
     * Table of boolean attributes. Keys are upper case tag names, values are
     * <TT>String[]</TT> lists of the attribute names that are boolean for
     * that element.
     */
    private static final Hashtable _boolAttrs = new Hashtable();

    /**
     * Holds element definitions. Keys are upper case tag names, values are
     * the element's flags held as <TT>Integer</TT> objects.
     */
    private static final Hashtable _elemDefs = new Hashtable();

    /**
     * Locates the HTML entities file that is loaded upon initialization.
     * This file is a resource loaded relative to this class.
     */
    private static final String ENTITIES_RESOURCE = "HTMLEntities.res";

    /**
     * Only opening tag should be printed.
     */
    private static final int ONLY_OPENING = 0x0001;

    /**
     * Element contains element content only.
     */
    private static final int ELEM_CONTENT = 0x0002;

    /**
     * Element preserve spaces.
     */
    private static final int PRESERVE = 0x0004;

    /**
     * Optional closing tag.
     */
    private static final int OPT_CLOSING = 0x0008;

    /**
     * Element is empty (also means only opening tag)
     */
    private static final int EMPTY = 0x0010 | ONLY_OPENING;

    /**
     * Allowed to appear in head.
     */
    private static final int ALLOWED_HEAD = 0x0020;

    /**
     * When opened, closes P.
     */
    private static final int CLOSE_P = 0x0040;

    /**
     * When opened, closes DD or DT.
     */
    private static final int CLOSE_DD_DT = 0x0080;

    /**
     * When opened, closes itself.
     */
    private static final int CLOSE_SELF = 0x0100;

    /**
     * When opened, closes another table section.
     */
    private static final int CLOSE_TABLE = 0x0200;

    /**
     * When opened, closes TH or TD.
     */
    private static final int CLOSE_TH_TD = 0x0400;

    /**
     * Returns true if element is declared to be empty. HTML elements are
     * defined as empty in the DTD, not by the document syntax.
     *
     * @param tagName The element tag name (any case)
     * @return True if element is empty
     */
    public static boolean isEmptyTag(String tagName) {
        return isElement(tagName, EMPTY);
    }

    /**
     * Returns true if element is declared to have element content.
     * Whitespaces appearing inside element content will be ignored,
     * other text will simply report an error.
     *
     * @param tagName The element tag name (any case)
     * @return True if element content
     */
    public static boolean isElementContent(String tagName) {
        return isElement(tagName, ELEM_CONTENT);
    }

    /**
     * Returns true if element's textual contents preserves spaces.
     * This applies to the elements flagged as space preserving in the
     * element table (<tt>PRE</tt>, <tt>TEXTAREA</tt>, <tt>SCRIPT</tt>,
     * <tt>NOSCRIPT</tt> and <tt>STYLE</tt>); all other HTML elements do
     * not preserve space.
     *
     * @param tagName The element tag name (any case)
     * @return True if element's text content preserves spaces
     */
    public static boolean isPreserveSpace(String tagName) {
        return isElement(tagName, PRESERVE);
    }

    /**
     * Returns true if element's closing tag is optional and need not
     * exist. An error will not be reported for such elements if they
     * are not closed. For example, <tt>LI</tt> is most often not closed.
     *
     * @param tagName The element tag name (any case)
     * @return True if closing tag implied
     */
    public static boolean isOptionalClosing(String tagName) {
        return isElement(tagName, OPT_CLOSING);
    }

    /**
     * Returns true if element's closing tag is generally not printed.
     * For example, <tt>LI</tt> should not print the closing tag.
     *
     * @param tagName The element tag name (any case)
     * @return True if only opening tag should be printed
     */
    public static boolean isOnlyOpening(String tagName) {
        return isElement(tagName, ONLY_OPENING);
    }

    /**
     * Returns true if the opening of one element (<tt>tagName</tt>) implies
     * the closing of another open element (<tt>openTag</tt>). For example,
     * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
     * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
     *
     * @param tagName The newly opened element
     * @param openTag The already opened element
     * @return True if opening <tt>tagName</tt> closes <tt>openTag</tt>
     */
    public static boolean isClosing(String tagName, String openTag) {
        // HEAD is closed by every element that is not allowed inside it.
        if (openTag.equalsIgnoreCase("HEAD")) {
            return !isElement(tagName, ALLOWED_HEAD);
        }
        // P is closed by block-level elements, including another P.
        if (openTag.equalsIgnoreCase("P")) {
            return isElement(tagName, CLOSE_P);
        }
        // DT closes DD, DD closes DT.
        if (openTag.equalsIgnoreCase("DT") || openTag.equalsIgnoreCase("DD")) {
            return isElement(tagName, CLOSE_DD_DT);
        }
        // LI and OPTION are closed by any element flagged as self-closing.
        if (openTag.equalsIgnoreCase("LI") || openTag.equalsIgnoreCase("OPTION")) {
            return isElement(tagName, CLOSE_SELF);
        }
        // Each of these table sections closes all the others.
        if (openTag.equalsIgnoreCase("THEAD")
                || openTag.equalsIgnoreCase("TFOOT")
                || openTag.equalsIgnoreCase("TBODY")
                || openTag.equalsIgnoreCase("TR")
                || openTag.equalsIgnoreCase("COLGROUP")) {
            return isElement(tagName, CLOSE_TABLE);
        }
        // TD closes TH and TH closes TD.
        if (openTag.equalsIgnoreCase("TH") || openTag.equalsIgnoreCase("TD")) {
            return isElement(tagName, CLOSE_TH_TD);
        }
        return false;
    }

    /**
     * Returns true if the specified attribute is a URI and should be
     * escaped appropriately. In HTML URIs are escaped differently
     * than normal attributes.
     * <P>
     * The check is deliberately simplistic: only the attribute name is
     * examined (<tt>href</tt> or <tt>src</tt>, any case); the tag name is
     * not consulted.
     *
     * @param tagName The element's tag name (currently unused)
     * @param attrName The attribute's name
     * @return True if the attribute is named <tt>href</tt> or <tt>src</tt>
     */
    public static boolean isURI(String tagName, String attrName) {
        return (attrName.equalsIgnoreCase("href") || attrName
                .equalsIgnoreCase("src"));
    }

    /**
     * Returns true if the specified attribute is a boolean and should be
     * printed without the value. This applies to attributes that are true
     * if they exist, such as selected (OPTION/INPUT).
     *
     * @param tagName The element's tag name (any case)
     * @param attrName The attribute's name (any case)
     * @return True if the attribute is registered as boolean for the element
     */
    public static boolean isBoolean(String tagName, String attrName) {
        // Upper-case with a fixed locale: the default locale's case rules
        // (e.g. Turkish dotted/dotless I) must not affect the table lookup.
        String[] attrNames = (String[]) _boolAttrs.get(tagName.toUpperCase(Locale.ENGLISH));
        if (attrNames == null) {
            return false;
        }
        for (int i = 0; i < attrNames.length; ++i) {
            if (attrNames[i].equalsIgnoreCase(attrName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the value of an HTML character reference by its name. If the
     * reference is not found or was not defined as a character reference,
     * returns EOF (-1).
     *
     * @param name Name of character reference (case sensitive)
     * @return Character code or EOF (-1)
     */
    public static int charFromName(String name) {
        initialize();
        Object value = _byName.get(name);
        // instanceof is false for null, so no separate null check is needed.
        if (value instanceof Integer) {
            return ((Integer) value).intValue();
        }
        return -1;
    }

    /**
     * Returns the name of an HTML character reference based on its character
     * value. Only valid for entities defined from character references. If no
     * such character value was defined, or the value is above 0xFFFF,
     * returns null.
     *
     * @param value Character value of entity
     * @return Entity's name or null
     */
    public static String fromChar(int value) {
        // Entities are stored from char values, so nothing above 0xFFFF can match.
        if (value > 0xffff) {
            return null;
        }
        initialize();
        return (String) _byChar.get(new Integer(value));
    }

    /**
     * Loads all the HTML character references from the entities resource
     * into tables that are accessible by name or by character value. This
     * method may be called any number of times; once the tables have been
     * loaded further calls return immediately.
     * <P>
     * Each resource line has the form <tt>name code [anything]</tt>, with
     * fields separated by a single space and the code given in decimal.
     * Empty lines and lines starting with <tt>#</tt> are skipped, as are
     * lines whose name is shorter than two characters or that have no code.
     *
     * @throws RuntimeException If the resource cannot be found, read or parsed
     */
    private static void initialize() {
        // Make sure not to initialize twice.
        if (_byName != null) {
            return;
        }

        // Checked outside the try block so that this error is reported as is
        // instead of being wrapped a second time by the handler below.
        InputStream is = HTMLdtd.class.getResourceAsStream(ENTITIES_RESOURCE);
        if (is == null) {
            throw new RuntimeException("SER003 The resource ["
                    + ENTITIES_RESOURCE + "] could not be found.\n"
                    + ENTITIES_RESOURCE);
        }

        _byName = new Hashtable();
        _byChar = new Hashtable();
        try {
            // Decode with an explicit charset rather than the platform default
            // (presumably the resource is plain ASCII - TODO confirm).
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(is, "US-ASCII"));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.length() == 0 || line.charAt(0) == '#') {
                    continue;
                }
                int index = line.indexOf(' ');
                if (index <= 1) {
                    continue;
                }
                String name = line.substring(0, index);
                ++index;
                if (index >= line.length()) {
                    continue;
                }
                // The code is everything up to the next space, if any.
                String value = line.substring(index);
                index = value.indexOf(' ');
                if (index > 0) {
                    value = value.substring(0, index);
                }
                defineEntity(name, (char) Integer.parseInt(value));
            }
        } catch (Exception except) {
            // Do not let partially loaded tables pass for a completed load.
            _byName = null;
            _byChar = null;
            throw new RuntimeException("SER003 The resource ["
                    + ENTITIES_RESOURCE + "] could not load: "
                    + except.toString() + "\n" + ENTITIES_RESOURCE
                    + "\t" + except.toString());
        } finally {
            try {
                is.close();
            } catch (Exception ignored) {
                // Best effort: a failure to close must not mask the outcome above.
            }
        }
    }

    /**
     * Defines a new character reference. The reference's name and value are
     * supplied. Nothing happens if the character reference is already defined.
     * <P>
     * Unlike internal entities, character references are a string to single
     * character mapping. They are used to map non-ASCII characters both on
     * parsing and printing, primarily for HTML documents. '&amp;amp;' is an
     * example of a character reference.
     *
     * @param name The entity's name
     * @param value The entity's value
     */
    private static void defineEntity(String name, char value) {
        if (_byName.get(name) == null) {
            _byName.put(name, new Integer(value));
            _byChar.put(new Integer(value), name);
        }
    }

    /**
     * Registers an element with its flags, replacing any earlier definition.
     *
     * @param name The element's tag name, in upper case
     * @param flags Bitwise OR of the element flag constants
     */
    private static void defineElement(String name, int flags) {
        _elemDefs.put(name, new Integer(flags));
    }

    /**
     * Registers a single boolean attribute for an element.
     *
     * @param tagName The element's tag name, in upper case
     * @param attrName The boolean attribute's name
     */
    private static void defineBoolean(String tagName, String attrName) {
        defineBoolean(tagName, new String[] { attrName });
    }

    /**
     * Registers the boolean attributes of an element, replacing any
     * earlier definition for that element.
     *
     * @param tagName The element's tag name, in upper case
     * @param attrNames The boolean attributes' names
     */
    private static void defineBoolean(String tagName, String[] attrNames) {
        _boolAttrs.put(tagName, attrNames);
    }

    /**
     * Returns true if the named element is defined and has every bit of
     * <tt>flag</tt> set. All bits are compared because {@link #EMPTY} is a
     * composite of two bits.
     *
     * @param name The element's tag name (any case)
     * @param flag The flag, or combination of flags, to test for
     * @return True if the element is defined with all the given flag bits
     */
    private static boolean isElement(String name, int flag) {
        // Upper-case with a fixed locale: the default locale's case rules
        // (e.g. Turkish dotted/dotless I) must not affect the table lookup.
        Integer flags = (Integer) _elemDefs.get(name.toUpperCase(Locale.ENGLISH));
        if (flags == null) {
            return false;
        }
        return ((flags.intValue() & flag) == flag);
    }

    static {
        // Element definitions: tag name (upper case) to flags.
        defineElement("ADDRESS", CLOSE_P);
        defineElement("AREA", EMPTY);
        defineElement("BASE", EMPTY | ALLOWED_HEAD);
        defineElement("BASEFONT", EMPTY);
        defineElement("BLOCKQUOTE", CLOSE_P);
        defineElement("BODY", OPT_CLOSING);
        defineElement("BR", EMPTY);
        defineElement("COL", EMPTY);
        defineElement("COLGROUP", ELEM_CONTENT | OPT_CLOSING
                | CLOSE_TABLE);
        defineElement("DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT);
        defineElement("DIV", CLOSE_P);
        defineElement("DL", ELEM_CONTENT | CLOSE_P);
        defineElement("DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT);
        defineElement("FIELDSET", CLOSE_P);
        defineElement("FORM", CLOSE_P);
        defineElement("FRAME", EMPTY | OPT_CLOSING);
        defineElement("H1", CLOSE_P);
        defineElement("H2", CLOSE_P);
        defineElement("H3", CLOSE_P);
        defineElement("H4", CLOSE_P);
        defineElement("H5", CLOSE_P);
        defineElement("H6", CLOSE_P);
        defineElement("HEAD", ELEM_CONTENT | OPT_CLOSING);
        defineElement("HR", EMPTY | CLOSE_P);
        defineElement("HTML", ELEM_CONTENT | OPT_CLOSING);
        defineElement("IMG", EMPTY);
        defineElement("INPUT", EMPTY);
        defineElement("ISINDEX", EMPTY | ALLOWED_HEAD);
        defineElement("LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF);
        defineElement("LINK", EMPTY | ALLOWED_HEAD);
        defineElement("MAP", ALLOWED_HEAD);
        defineElement("META", EMPTY | ALLOWED_HEAD);
        defineElement("OL", ELEM_CONTENT | CLOSE_P);
        defineElement("OPTGROUP", ELEM_CONTENT);
        defineElement("OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF);
        defineElement("P", OPT_CLOSING | CLOSE_P | CLOSE_SELF);
        defineElement("PARAM", EMPTY);
        defineElement("PRE", PRESERVE | CLOSE_P);
        defineElement("SCRIPT", ALLOWED_HEAD | PRESERVE);
        defineElement("NOSCRIPT", ALLOWED_HEAD | PRESERVE);
        defineElement("SELECT", ELEM_CONTENT);
        defineElement("STYLE", ALLOWED_HEAD | PRESERVE);
        defineElement("TABLE", ELEM_CONTENT | CLOSE_P);
        defineElement("TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
        defineElement("TD", OPT_CLOSING | CLOSE_TH_TD);
        defineElement("TEXTAREA", PRESERVE);
        defineElement("TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
        defineElement("TH", OPT_CLOSING | CLOSE_TH_TD);
        defineElement("THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
        defineElement("TITLE", ALLOWED_HEAD);
        defineElement("TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE);
        defineElement("UL", ELEM_CONTENT | CLOSE_P);

        // Boolean attributes: tag name (upper case) to attribute names.
        // NOTE(review): presumably "nohref" was intended for AREA - confirm
        // against the HTML 4 DTD before changing.
        defineBoolean("AREA", "href");
        defineBoolean("BUTTON", "disabled");
        defineBoolean("DIR", "compact");
        defineBoolean("DL", "compact");
        defineBoolean("FRAME", "noresize");
        defineBoolean("HR", "noshade");
        // NOTE(review): no element named IMAGE is defined above; presumably
        // IMG was intended - confirm.
        defineBoolean("IMAGE", "ismap");
        defineBoolean("INPUT", new String[] { "defaultchecked",
                "checked", "readonly", "disabled" });
        // NOTE(review): a "link" attribute on LINK looks suspicious - confirm.
        defineBoolean("LINK", "link");
        defineBoolean("MENU", "compact");
        defineBoolean("OBJECT", "declare");
        defineBoolean("OL", "compact");
        defineBoolean("OPTGROUP", "disabled");
        defineBoolean("OPTION", new String[] { "default-selected",
                "selected", "disabled" });
        defineBoolean("SCRIPT", "defer");
        defineBoolean("SELECT", new String[] { "multiple", "disabled" });
        defineBoolean("STYLE", "disabled");
        defineBoolean("TD", "nowrap");
        defineBoolean("TH", "nowrap");
        defineBoolean("TEXTAREA",
                new String[] { "disabled", "readonly" });
        defineBoolean("UL", "compact");

        // Load the character references eagerly so that a missing or broken
        // resource is reported when the class is first used.
        initialize();
    }

}
|