001: package net.sf.saxon.event;
002:
003: import net.sf.saxon.charcode.UnicodeCharacterSet;
004: import net.sf.saxon.om.FastStringBuffer;
005: import net.sf.saxon.om.NamePool;
006: import net.sf.saxon.trans.XPathException;
007: import net.sf.saxon.codenorm.Normalizer;
008:
009: /**
010: * This class is used as a filter on the serialization pipeline; it performs the function
011: * of escaping URI-valued attributes in HTML
012: * @author Michael H. Kay
013: */
014:
015: public class HTMLURIEscaper extends ProxyReceiver {
016:
017: /**
018: * Table of attributes whose value is a URL
019: */
020:
021: // we use two HashMaps to avoid unnecessary string concatenations
022: private static HTMLTagHashSet urlAttributes = new HTMLTagHashSet(47);
023: private static HTMLTagHashSet urlCombinations = new HTMLTagHashSet(
024: 101);
025:
026: static {
027: setUrlAttribute("form", "action");
028: setUrlAttribute("object", "archive");
029: setUrlAttribute("body", "background");
030: setUrlAttribute("q", "cite");
031: setUrlAttribute("blockquote", "cite");
032: setUrlAttribute("del", "cite");
033: setUrlAttribute("ins", "cite");
034: setUrlAttribute("object", "classid");
035: setUrlAttribute("object", "codebase");
036: setUrlAttribute("applet", "codebase");
037: setUrlAttribute("object", "data");
038: setUrlAttribute("button", "datasrc");
039: setUrlAttribute("div", "datasrc");
040: setUrlAttribute("input", "datasrc");
041: setUrlAttribute("object", "datasrc");
042: setUrlAttribute("select", "datasrc");
043: setUrlAttribute("span", "datasrc");
044: setUrlAttribute("table", "datasrc");
045: setUrlAttribute("textarea", "datasrc");
046: setUrlAttribute("script", "for");
047: setUrlAttribute("a", "href");
048: setUrlAttribute("a", "name"); // see second note in section B.2.1 of HTML 4 specification
049: setUrlAttribute("area", "href");
050: setUrlAttribute("link", "href");
051: setUrlAttribute("base", "href");
052: setUrlAttribute("img", "longdesc");
053: setUrlAttribute("frame", "longdesc");
054: setUrlAttribute("iframe", "longdesc");
055: setUrlAttribute("head", "profile");
056: setUrlAttribute("script", "src");
057: setUrlAttribute("input", "src");
058: setUrlAttribute("frame", "src");
059: setUrlAttribute("iframe", "src");
060: setUrlAttribute("img", "src");
061: setUrlAttribute("img", "usemap");
062: setUrlAttribute("input", "usemap");
063: setUrlAttribute("object", "usemap");
064: }
065:
066: private static void setUrlAttribute(String element, String attribute) {
067: urlAttributes.add(attribute);
068: urlCombinations.add(element + '+' + attribute);
069: }
070:
071: public boolean isUrlAttribute(int element, int attribute) {
072: if (pool == null) {
073: pool = getNamePool();
074: }
075: String attributeName = pool.getDisplayName(attribute);
076: if (!urlAttributes.contains(attributeName)) {
077: return false;
078: }
079: String elementName = pool.getDisplayName(element);
080: return urlCombinations.contains(elementName + '+'
081: + attributeName);
082: }
083:
084: protected int currentElement;
085: protected boolean escapeURIAttributes = true;
086: protected NamePool pool;
087:
088: /**
089: * Start of event stream
090: */
091:
092: public void open() throws XPathException {
093: super .open();
094: }
095:
096: /**
097: * Start of a document node.
098: */
099:
100: public void startDocument(int properties) throws XPathException {
101: super .startDocument(properties);
102: pool = getPipelineConfiguration().getConfiguration()
103: .getNamePool();
104: }
105:
106: /**
107: * Notify the start of an element
108: *
109: * @param nameCode integer code identifying the name of the element within the name pool.
110: * @param typeCode integer code identifying the element's type within the name pool.
111: * @param properties properties of the element node
112: */
113:
114: public void startElement(int nameCode, int typeCode,
115: int locationId, int properties) throws XPathException {
116: currentElement = nameCode;
117: getUnderlyingReceiver().startElement(nameCode, typeCode,
118: locationId, properties);
119: }
120:
121: /**
122: * Notify an attribute. Attributes are notified after the startElement event, and before any
123: * children. Namespaces and attributes may be intermingled.
124: *
125: * @param nameCode The name of the attribute, as held in the name pool
126: * @param typeCode The type of the attribute, as held in the name pool
127: * @param properties Bit significant value. The following bits are defined:
128: * <dd>DISABLE_ESCAPING</dd> <dt>Disable escaping for this attribute</dt>
129: * <dd>NO_SPECIAL_CHARACTERS</dd> <dt>Attribute value contains no special characters</dt>
130: * @throws IllegalStateException: attempt to output an attribute when there is no open element
131: * start tag
132: */
133:
134: public void attribute(int nameCode, int typeCode,
135: CharSequence value, int locationId, int properties)
136: throws XPathException {
137: if (escapeURIAttributes
138: && isUrlAttribute(currentElement, nameCode)
139: && (properties & ReceiverOptions.DISABLE_ESCAPING) == 0) {
140: getUnderlyingReceiver()
141: .attribute(
142: nameCode,
143: typeCode,
144: escapeURL(value),
145: locationId,
146: properties
147: | ReceiverOptions.DISABLE_CHARACTER_MAPS);
148: } else {
149: getUnderlyingReceiver().attribute(nameCode, typeCode,
150: value, locationId, properties);
151: }
152: }
153:
154: /**
155: * Escape a URI according to the HTML rules: that is, a non-ASCII character (specifically,
156: * a character outside the range 32 - 126) is replaced by the %HH encoding of the octets in
157: * its UTF-8 representation
158: * @param url the URI to be escaped
159: * @return the URI after escaping non-ASCII characters
160: */
161:
162: public static CharSequence escapeURL(CharSequence url) {
163: // optimize for the common case where the string is all ASCII characters
164: for (int i = url.length() - 1; i >= 0; i--) {
165: char ch = url.charAt(i);
166: if (ch < 32 || ch > 126) {
167: CharSequence normalized = new Normalizer(Normalizer.C)
168: .normalize(url);
169: return reallyEscapeURL(normalized);
170: }
171: }
172: return url;
173: }
174:
175: private static CharSequence reallyEscapeURL(CharSequence url) {
176: FastStringBuffer sb = new FastStringBuffer(url.length() + 20);
177: final String hex = "0123456789ABCDEF";
178: byte[] array = new byte[4];
179:
180: for (int i = 0; i < url.length(); i++) {
181: char ch = url.charAt(i);
182: if (ch < 32 || ch > 126) {
183: int used = UnicodeCharacterSet
184: .getUTF8Encoding(ch,
185: (i + 1 < url.length() ? url
186: .charAt(i + 1) : ' '), array);
187: for (int b = 0; b < used; b++) {
188: //int v = (array[b]>=0 ? array[b] : 256 + array[b]);
189: int v = ((int) array[b]) & 0xff;
190: sb.append('%');
191: sb.append(hex.charAt(v / 16));
192: sb.append(hex.charAt(v % 16));
193: }
194:
195: } else {
196: sb.append(ch);
197: }
198: }
199: return sb;
200: }
201: }
202:
203: //
204: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
205: // you may not use this file except in compliance with the License. You may obtain a copy of the
206: // License at http://www.mozilla.org/MPL/
207: //
208: // Software distributed under the License is distributed on an "AS IS" basis,
209: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
210: // See the License for the specific language governing rights and limitations under the License.
211: //
212: // The Original Code is: all this file.
213: //
214: // The Initial Developer of the Original Code is Michael H. Kay.
215: //
216: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
217: //
218: // Contributor(s): none.
219: //
|