001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.commons.betwixt;
018:
/**
 * <p><code>XMLUtils</code> contains basic utility methods for XML.</p>
 *
 * <p>The code for {@link #isWellFormedXMLName} is based on code in
 * <code>org.apache.xerces.util.XMLChar</code>
 * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
 * The authors of this class are credited below.</p>
 *
 * @author Glenn Marcy, IBM
 * @author Andy Clark, IBM
 * @author Eric Ye, IBM
 * @author Arnaud Le Hors, IBM
 * @author Rahul Srivastava, Sun Microsystems Inc.
 *
 * @author Robert Burrell Donkin
 * @since 0.5
 */
public class XMLUtils {

    // Constants
    //-------------------------------------------------------------------------

    /** Entity reference that replaces <code>&lt;</code>. */
    public static final String LESS_THAN_ENTITY = "&lt;";
    /** Entity reference that replaces <code>&gt;</code>. */
    public static final String GREATER_THAN_ENTITY = "&gt;";
    /** Entity reference that replaces <code>&amp;</code>. */
    public static final String AMPERSAND_ENTITY = "&amp;";
    /** Entity reference that replaces <code>'</code>. */
    public static final String APOSTROPHE_ENTITY = "&apos;";
    /** Entity reference that replaces <code>&quot;</code>. */
    public static final String QUOTE_ENTITY = "&quot;";

    // Used by isWellFormedXMLName
    /** Name start character mask. */
    private static final int MASK_NAME_START = 0x01;
    /** Name character mask. */
    private static final int MASK_NAME = 0x02;

    // Class attributes
    //-------------------------------------------------------------------------

    /**
     * Character flags, indexed by UTF-16 code unit. Each entry is a
     * combination of {@link #MASK_NAME_START} and {@link #MASK_NAME}.
     */
    private static final byte[] CHARS = new byte[1 << 16];

    //
    // Static initialization
    //

    static {

        //
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
        //                  CombiningChar | Extender
        //

        int[] nameChar = { 0x002D, 0x002E, // '-' and '.'
        };

        //
        // [5] Name ::= (Letter | '_' | ':') (NameChar)*
        //

        int[] nameStartChar = { 0x003A, 0x005F, // ':' and '_'
        };

        //
        // [84] Letter ::= BaseChar | Ideographic
        //

        // Pairs of inclusive bounds: { low0, high0, low1, high1, ... }
        int[] letterRange = {
            // BaseChar
            0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8,
            0x00F6, 0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148,
            0x014A, 0x017E, 0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4,
            0x01F5, 0x01FA, 0x0217, 0x0250, 0x02A8, 0x02BB, 0x02C1,
            0x0388, 0x038A, 0x038E, 0x03A1, 0x03A3, 0x03CE, 0x03D0,
            0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C, 0x040E, 0x044F,
            0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4, 0x04C7,
            0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
            0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0,
            0x05EA, 0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A,
            0x0671, 0x06B7, 0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0,
            0x06D3, 0x06E5, 0x06E6, 0x0905, 0x0939, 0x0958, 0x0961,
            0x0985, 0x098C, 0x098F, 0x0990, 0x0993, 0x09A8, 0x09AA,
            0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD, 0x09DF, 0x09E1,
            0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10, 0x0A13,
            0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
            0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85,
            0x0A8B, 0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0,
            0x0AB2, 0x0AB3, 0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F,
            0x0B10, 0x0B13, 0x0B28, 0x0B2A, 0x0B30, 0x0B32, 0x0B33,
            0x0B36, 0x0B39, 0x0B5C, 0x0B5D, 0x0B5F, 0x0B61, 0x0B85,
            0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95, 0x0B99, 0x0B9A,
            0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA, 0x0BAE,
            0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
            0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60,
            0x0C61, 0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8,
            0x0CAA, 0x0CB3, 0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05,
            0x0D0C, 0x0D0E, 0x0D10, 0x0D12, 0x0D28, 0x0D2A, 0x0D39,
            0x0D60, 0x0D61, 0x0E01, 0x0E2E, 0x0E32, 0x0E33, 0x0E40,
            0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88, 0x0E94, 0x0E97,
            0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB, 0x0EAD,
            0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
            0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102,
            0x1103, 0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112,
            0x1154, 0x1155, 0x115F, 0x1161, 0x116D, 0x116E, 0x1172,
            0x1173, 0x11AE, 0x11AF, 0x11B7, 0x11B8, 0x11BC, 0x11C2,
            0x1E00, 0x1E9B, 0x1EA0, 0x1EF9, 0x1F00, 0x1F15, 0x1F18,
            0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D, 0x1F50, 0x1F57,
            0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC, 0x1FC2,
            0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
            0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A,
            0x212B, 0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA,
            0x3105, 0x312C, 0xAC00, 0xD7A3,
            // Ideographic
            0x3021, 0x3029, 0x4E00, 0x9FA5, };
        int[] letterChar = {
            // BaseChar
            0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559,
            0x06D5, 0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0,
            0x0B3D, 0x0B9C, 0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D,
            0x0EA5, 0x0EA7, 0x0EB0, 0x0EBD, 0x1100, 0x1109, 0x113C,
            0x113E, 0x1140, 0x114C, 0x114E, 0x1150, 0x1159, 0x1163,
            0x1165, 0x1167, 0x1169, 0x1175, 0x119E, 0x11A8, 0x11AB,
            0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B, 0x1F5D,
            0x1FBE, 0x2126, 0x212E,
            // Ideographic
            0x3007, };

        //
        // [87] CombiningChar ::= ...
        //

        int[] combiningCharRange = { 0x0300, 0x0345, 0x0360, 0x0361,
            0x0483, 0x0486, 0x0591, 0x05A1, 0x05A3, 0x05B9, 0x05BB,
            0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652, 0x06D6, 0x06DC,
            0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8, 0x06EA,
            0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
            0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7,
            0x09C8, 0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42,
            0x0A47, 0x0A48, 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81,
            0x0A83, 0x0ABE, 0x0AC5, 0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD,
            0x0B01, 0x0B03, 0x0B3E, 0x0B43, 0x0B47, 0x0B48, 0x0B4B,
            0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83, 0x0BBE, 0x0BC2,
            0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03, 0x0C3E,
            0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
            0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA,
            0x0CCD, 0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43,
            0x0D46, 0x0D48, 0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47,
            0x0E4E, 0x0EB4, 0x0EB9, 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD,
            0x0F18, 0x0F19, 0x0F71, 0x0F84, 0x0F86, 0x0F8B, 0x0F90,
            0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7, 0x20D0, 0x20DC,
            0x302A, 0x302F, };

        int[] combiningCharChar = { 0x05BF, 0x05C4, 0x0670, 0x093C,
            0x094D, 0x09BC, 0x09BE, 0x09BF, 0x09D7, 0x0A02, 0x0A3C,
            0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7, 0x0D57, 0x0E31,
            0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, 0x0F97,
            0x0FB9, 0x20E1, 0x3099, 0x309A, };

        //
        // [88] Digit ::= ...
        //

        int[] digitRange = { 0x0030, 0x0039, 0x0660, 0x0669, 0x06F0,
            0x06F9, 0x0966, 0x096F, 0x09E6, 0x09EF, 0x0A66, 0x0A6F,
            0x0AE6, 0x0AEF, 0x0B66, 0x0B6F, 0x0BE7, 0x0BEF, 0x0C66,
            0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F, 0x0E50, 0x0E59,
            0x0ED0, 0x0ED9, 0x0F20, 0x0F29, };

        //
        // [89] Extender ::= ...
        //

        int[] extenderRange = { 0x3031, 0x3035, 0x309D, 0x309E, 0x30FC,
            0x30FE, };

        int[] extenderChar = { 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640,
            0x0E46, 0x0EC6, 0x3005, };

        //
        // Initialize
        //

        // Name start characters are also valid in the rest of a name,
        // so they receive both flags.
        final int startFlags = MASK_NAME_START | MASK_NAME;
        flagChars(nameStartChar, startFlags);
        flagRanges(letterRange, startFlags);
        flagChars(letterChar, startFlags);

        // Characters that are only valid after the first position.
        flagChars(nameChar, MASK_NAME);
        flagRanges(digitRange, MASK_NAME);
        flagRanges(combiningCharRange, MASK_NAME);
        flagChars(combiningCharChar, MASK_NAME);
        flagRanges(extenderRange, MASK_NAME);
        flagChars(extenderChar, MASK_NAME);

    }

    /** ORs <code>flags</code> into the {@link #CHARS} entry of every listed code. */
    private static void flagChars(int[] codes, int flags) {
        for (int i = 0; i < codes.length; i++) {
            CHARS[codes[i]] |= flags;
        }
    }

    /**
     * ORs <code>flags</code> into the {@link #CHARS} entries of every code
     * covered by the inclusive (low, high) pairs in <code>bounds</code>.
     */
    private static void flagRanges(int[] bounds, int flags) {
        for (int i = 0; i < bounds.length; i += 2) {
            for (int code = bounds[i]; code <= bounds[i + 1]; code++) {
                CHARS[code] |= flags;
            }
        }
    }

    // Constructor
    //-------------------------------------------------------------------------

    /**
     * <p>Constructor for use by tools that require <code>JavaBean</code> instances.</p>
     *
     * <p>This constructor is public <strong>only</strong>
     * to permit tools that require a JavaBean instance to operate.
     * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
     * programming. Instead, the class methods should be called directly.</p>
     */
    public XMLUtils() {
    }

    // Class methods
    //-------------------------------------------------------------------------

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use as body text.</p>
     *
     * <p>The characters <code>&lt;</code>, <code>&gt;</code> and
     * <code>&amp;</code> are replaced by {@link #LESS_THAN_ENTITY},
     * {@link #GREATER_THAN_ENTITY} and {@link #AMPERSAND_ENTITY}.</p>
     *
     * @param value escape <code>value.toString()</code>, not null
     * @return text with escaped delimiters
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeBodyValue(Object value) {
        return escape(value.toString(), false);
    }

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use in an attribute value.</p>
     *
     * <p>In addition to the characters escaped by {@link #escapeBodyValue},
     * <code>'</code> and <code>&quot;</code> are replaced by
     * {@link #APOSTROPHE_ENTITY} and {@link #QUOTE_ENTITY}.</p>
     *
     * @param value escape <code>value.toString()</code>, not null
     * @return text with characters restricted (for use in attributes) escaped
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeAttributeValue(Object value) {
        return escape(value.toString(), true);
    }

    /**
     * Copies <code>text</code> in a single pass, substituting entity
     * references for markup delimiters.
     *
     * @param text the text to escape, not null
     * @param escapeQuotes whether <code>'</code> and <code>&quot;</code>
     * are escaped as well (needed inside attribute values)
     * @return the escaped text
     */
    private static String escape(String text, boolean escapeQuotes) {
        final int length = text.length();
        // Small head-room so a few substitutions do not force a regrow.
        StringBuffer escaped = new StringBuffer(length + 16);
        for (int i = 0; i < length; i++) {
            char c = text.charAt(i);
            switch (c) {
                case '<':
                    escaped.append(LESS_THAN_ENTITY);
                    break;
                case '>':
                    escaped.append(GREATER_THAN_ENTITY);
                    break;
                case '&':
                    escaped.append(AMPERSAND_ENTITY);
                    break;
                case '\'':
                    if (escapeQuotes) {
                        escaped.append(APOSTROPHE_ENTITY);
                    } else {
                        escaped.append(c);
                    }
                    break;
                case '"':
                    if (escapeQuotes) {
                        escaped.append(QUOTE_ENTITY);
                    } else {
                        escaped.append(c);
                    }
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]&gt;' is recognized as markup.
     * @param content the body content whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml, not null
     * @return escaped character data, not null
     * @throws NullPointerException if <code>content</code> is null
     */
    public static final String escapeCDATAContent(String content) {
        StringBuffer buffer = new StringBuffer(content);
        escapeCDATAContent(buffer);
        return buffer.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]&gt;' is recognized as markup. The buffer is modified in
     * place: the <code>&gt;</code> that closes each such sequence is
     * replaced by {@link #GREATER_THAN_ENTITY}.
     * @param bufferedContent the body content within a buffer
     * whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml, not null
     * @throws NullPointerException if <code>bufferedContent</code> is null
     */
    public static final void escapeCDATAContent(
            StringBuffer bufferedContent) {
        final String cdEnd = "]]>";
        int from = 0;
        int start = bufferedContent.indexOf(cdEnd, from);
        while (start >= 0) {
            // Only the trailing '>' is rewritten; the two brackets stay.
            int gt = start + 2;
            bufferedContent.replace(gt, gt + 1, GREATER_THAN_ENTITY);
            // Resume after the inserted entity so it is never rescanned.
            from = gt + GREATER_THAN_ENTITY.length();
            start = bufferedContent.indexOf(cdEnd, from);
        }
    }

    /**
     * <p>Is this string a well formed xml name?</p>
     *
     * <p>Only certain characters are allowed in well formed element and attribute
     * names in xml. For example, white space is not allowed in a name.</p>
     *
     * <p>The code for this method is based on code in
     * <code>org.apache.xerces.util.XMLChar</code>
     * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
     * The authors of this class are credited at the top of this class.</p>
     *
     * @param name the <code>String</code> to be checked for use as an xml attribute
     * or element name. Returns false if <code>name</code> is null or empty
     * @return true if this string would be a well-formed name
     */
    public static boolean isWellFormedXMLName(String name) {
        if (name == null || name.length() == 0) {
            return false;
        }

        // The first character is held to the stricter name-start rule.
        if (!isNameStartChar(name.charAt(0))) {
            return false;
        }

        for (int i = 1; i < name.length(); i++) {
            if (!isNameChar(name.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the specified character is a valid name
     * character as defined by the XML 1.0 specification.
     * Values outside the range <code>0x0000</code>-<code>0xFFFF</code>
     * (including negative values) are never name characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name character
     */
    public static boolean isNameChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME) != 0;
    }

    /**
     * Returns true if the specified character is a valid name start
     * character as defined in the XML 1.0 specification.
     * Values outside the range <code>0x0000</code>-<code>0xFFFF</code>
     * (including negative values) are never name start characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name start character
     */
    public static boolean isNameStartChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME_START) != 0;
    }
}
|