001: /*
002: * @(#)XmlUtil.java
003: *
004: * Copyright (C) 2001-2003 Matt Albrecht
005: * groboclown@users.sourceforge.net
006: * http://groboutils.sourceforge.net
007: *
008: * Permission is hereby granted, free of charge, to any person obtaining a
009: * copy of this software and associated documentation files (the "Software"),
010: * to deal in the Software without restriction, including without limitation
011: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
012: * and/or sell copies of the Software, and to permit persons to whom the
013: * Software is furnished to do so, subject to the following conditions:
014: *
015: * The above copyright notice and this permission notice shall be included in
016: * all copies or substantial portions of the Software.
017: *
018: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
019: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
020: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
021: * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
022: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
023: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
024: * DEALINGS IN THE SOFTWARE.
025: */
026:
027: package net.sourceforge.groboutils.util.xml.v1;
028:
/**
 * A utility to aid in various XML activities. At present it converts Java
 * strings into text that is safe to embed in XML character data and
 * attribute values.
 *
 * @author Matt Albrecht <a href="mailto:groboclown@users.sourceforge.net">groboclown@users.sourceforge.net</a>
 * @since May 21, 2001
 * @version $Date: 2003/11/23 21:28:47 $
 */
public class XMLUtil {
    /** Shared instance returned by {@link #getInstance()}. */
    protected static XMLUtil s_instance = new XMLUtil();

    // The XML recommendation allows:
    //   [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] |
    //                [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    // To keep the output independent of the document encoding, only the
    // printable 7-bit range below is written literally; everything above
    // UPPER_RANGE is written as a numeric character reference.
    private static final char LOWER_RANGE = 0x20;
    private static final char UPPER_RANGE = 0x7f;

    // Characters below LOWER_RANGE that are written literally.
    private static final char VALID_CHAR_1 = 0x9;
    private static final char VALID_CHAR_2 = 0xA;
    private static final char VALID_CHAR_3 = 0xD;

    // UTF-16 surrogate blocks, used to reassemble supplementary characters.
    private static final char HIGH_SURROGATE_MIN = 0xD800;
    private static final char HIGH_SURROGATE_MAX = 0xDBFF;
    private static final char LOW_SURROGATE_MIN = 0xDC00;
    private static final char LOW_SURROGATE_MAX = 0xDFFF;
    private static final int SUPPLEMENTARY_MIN = 0x10000;

    /**
     * Creates the utility. Protected so that only subclasses and
     * {@link #getInstance()} construct instances.
     */
    protected XMLUtil() {
        // do nothing
    }

    /**
     * Returns the shared utility instance.
     *
     * @return the instance held in <tt>s_instance</tt>.
     */
    public static XMLUtil getInstance() {
        return s_instance;
    }

    //------------------------------------------

    /**
     * Convert a standard Java String into an XML string. It transforms
     * the markup characters (&lt;, &gt;, &amp;, &quot;, &apos;) and
     * characters outside the printable 7-bit range into XML entity or
     * numeric character references. Since the quote characters are escaped,
     * the result may be used for both attribute values and standard text.
     *
     * @param javaStr the Java string to be transformed into XML text. If
     *      the string is <tt>null</tt>, then <tt>null</tt> is returned.
     * @return the XML version of <tt>javaStr</tt>.
     * @see #utf2xml( String, StringBuffer )
     */
    public String utf2xml(String javaStr) {
        if (javaStr == null) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        utf2xml(javaStr, sb);
        return sb.toString();
    }

    /**
     * Convert a standard Java String into an XML string, appending the
     * result to <tt>output</tt>. It transforms the markup characters
     * (&lt;, &gt;, &amp;, &quot;, &apos;) into the predefined entities, and
     * every character below 0x20 (other than tab, line feed and carriage
     * return) or above 0x7F into a decimal numeric character reference.
     * Since the quote characters are escaped, the result may be used for
     * both attribute values and standard text.
     * <P>
     * A valid UTF-16 surrogate pair is combined and written as a single
     * reference to the supplementary code point. Characters that XML 1.0
     * does not allow at all (such as U+0000 or an unpaired surrogate) are
     * still written as numeric references of their <tt>char</tt> value, so
     * input containing them does not produce well-formed XML.
     * <P>
     * From <a href="http://www.w3c.org/TR/2000/REC-xml-20001006">
     * the XML recommendation</a>:
     * <PRE>
     * Character Range
     * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] |
     *              [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     *      // any Unicode character, excluding the surrogate blocks,
     *         FFFE, and FFFF. //
     *
     * ...
     *
     * The ampersand character (&amp;) and the left angle bracket (&lt;)
     * may appear in their literal form only when used as markup delimiters, or
     * within a comment, a processing instruction, or a CDATA section. If they
     * are needed elsewhere, they must be escaped using either numeric
     * character references or the strings "&amp;amp;" and "&amp;lt;"
     * respectively. The right angle bracket (&gt;) may be represented using the
     * string "&amp;gt;", and must, for compatibility, be escaped using
     * "&amp;gt;" or a character reference when it appears in the string
     * "]]&gt;" in content, when that string is not marking the end of a CDATA
     * section.
     * To allow attribute values to contain both single and double quotes, the
     * apostrophe or single-quote character (') may be represented as
     * "&amp;apos;", and the double-quote character (") as "&amp;quot;".
     * </PRE>
     *
     * @param javaStr the Java string to be transformed into XML text. If
     *      it is <tt>null</tt>, then the text "null" is appended to the
     *      output.
     * @param output the StringBuffer to send the transformed XML into.
     * @throws IllegalArgumentException if <tt>output</tt> is <tt>null</tt>.
     */
    public void utf2xml(String javaStr, StringBuffer output) {
        if (output == null) {
            throw new IllegalArgumentException("No null StringBuffer");
        }
        if (javaStr == null) {
            // "null" contains nothing that needs escaping, so append it
            // directly instead of running it through the loop.
            output.append("null");
            return;
        }

        int len = javaStr.length();
        // Heuristic: most strings grow by less than a factor of two.
        output.ensureCapacity(output.length() + (len * 2));

        for (int pos = 0; pos < len; ++pos) {
            char c = javaStr.charAt(pos);
            if (c > UPPER_RANGE) {
                int codePoint = c;
                if (isHighSurrogate(c) && pos + 1 < len
                        && isLowSurrogate(javaStr.charAt(pos + 1))) {
                    // Reassemble the supplementary character; a reference
                    // to each surrogate on its own is not legal XML.
                    char low = javaStr.charAt(++pos);
                    codePoint = SUPPLEMENTARY_MIN
                            + ((c - HIGH_SURROGATE_MIN) << 10)
                            + (low - LOW_SURROGATE_MIN);
                }
                appendCharacterReference(output, codePoint);
            } else if (c < LOWER_RANGE && c != VALID_CHAR_1
                    && c != VALID_CHAR_2 && c != VALID_CHAR_3) {
                appendCharacterReference(output, c);
            } else {
                String entity = entityFor(c);
                if (entity != null) {
                    output.append(entity);
                } else {
                    // append the character as-is
                    output.append(c);
                }
            }
        }
    }

    /** Returns the predefined XML entity for a markup character, or null if none applies. */
    private static String entityFor(char c) {
        switch (c) {
            case '<':
                return "&lt;";
            case '>':
                return "&gt;";
            case '"':
                return "&quot;";
            case '\'':
                return "&apos;";
            case '&':
                return "&amp;";
            default:
                return null;
        }
    }

    /** Appends a decimal numeric character reference for the given code point. */
    private static void appendCharacterReference(StringBuffer output, int codePoint) {
        output.append("&#");
        output.append(Integer.toString(codePoint));
        output.append(';');
    }

    /** Tells whether the char is the leading half of a UTF-16 surrogate pair. */
    private static boolean isHighSurrogate(char c) {
        return c >= HIGH_SURROGATE_MIN && c <= HIGH_SURROGATE_MAX;
    }

    /** Tells whether the char is the trailing half of a UTF-16 surrogate pair. */
    private static boolean isLowSurrogate(char c) {
        return c >= LOW_SURROGATE_MIN && c <= LOW_SURROGATE_MAX;
    }
}
|