001: /*
002: * $Id: URLEncoding.java,v 1.32 2007/09/18 08:45:07 agoubard Exp $
003: *
004: * Copyright 2003-2007 Orange Nederland Breedband B.V.
005: * See the COPYRIGHT file for redistribution and use restrictions.
006: */
007: package org.xins.common.text;
008:
009: import java.io.ByteArrayOutputStream;
010: import java.io.UnsupportedEncodingException;
011: import org.xins.common.MandatoryArgumentChecker;
012: import org.xins.common.Utils;
013:
014: /**
015: * URL encoding utility functions with Unicode support. This class supports
016: * both encoding and decoding. All characters higher than 127 will be encoded
017: * as %uxxxx where xxxx is the Unicode value of the character in hexadecimal.
018: *
019: * @version $Revision: 1.32 $ $Date: 2007/09/18 08:45:07 $
020: * @author <a href="mailto:ernst@ernstdehaan.com">Ernst de Haan</a>
021: * @author <a href="mailto:anthony.goubard@japplis.com">Anthony Goubard</a>
022: *
023: * @since XINS 1.0.0
024: */
025: public final class URLEncoding {
026:
027: /**
028: * The character zero (<code>'0'</code>) as an <code>int</code>.
029: */
030: private static final int CHAR_ZERO = (int) '0';
031:
032: /**
033: * The character nine (<code>'9'</code>) as an <code>int</code>.
034: */
035: private static final int CHAR_NINE = (int) '9';
036:
037: /**
038: * The character lowercase A (<code>'a'</code>) as an <code>int</code>.
039: */
040: private static final int CHAR_LOWER_A = (int) 'a';
041:
042: /**
043: * The character lowercase F (<code>'f'</code>) as an <code>int</code>.
044: */
045: private static final int CHAR_LOWER_F = (int) 'f';
046:
047: /**
048: * The character uppercase A (<code>'A'</code>) as an <code>int</code>.
049: */
050: private static final int CHAR_UPPER_A = (int) 'A';
051:
052: /**
053: * The character uppercase F (<code>'F'</code>) as an <code>int</code>.
054: */
055: private static final int CHAR_UPPER_F = (int) 'F';
056:
057: /**
058: * Mappings from unencoded (array index) to encoded values (array
059: * elements). The size of this array is 127.
060: */
061: private static final String[] UNENCODED_TO_ENCODED;
062:
063: static {
064: UNENCODED_TO_ENCODED = new String[255];
065: for (int i = 0; i < 255; i++) {
066: char c = (char) i;
067: if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
068: || (c >= '0' && c <= '9') || (c == '-')
069: || (c == '_') || (c == '.') || (c == '*')) {
070: UNENCODED_TO_ENCODED[i] = String.valueOf(c);
071: } else if (c == ' ') {
072: UNENCODED_TO_ENCODED[i] = "+";
073: } else {
074: char[] data = new char[3];
075: data[0] = '%';
076: data[1] = Character.toUpperCase(Character.forDigit(
077: (i >> 4) & 0xF, 16));
078: data[2] = Character.toUpperCase(Character.forDigit(
079: i & 0xF, 16));
080: UNENCODED_TO_ENCODED[i] = new String(data);
081: }
082: }
083: }
084:
085: /**
086: * Constructs a new <code>URLEncoding</code> object.
087: */
088: private URLEncoding() {
089: // empty
090: }
091:
092: /**
093: * URL encodes the specified character string as specified by W3C.
094: * http://www.w3.org/International/O-URL-code.html
095: *
096: * @param s
097: * the string to URL encode, not <code>null</code>.
098: *
099: * @return
100: * URL encoded version of the specified character string, never
101: * <code>null</code>.
102: *
103: * @throws IllegalArgumentException
104: * if <code>s == null</code>
105: */
106: public static String encode(String s)
107: throws IllegalArgumentException {
108:
109: // Check preconditions
110: MandatoryArgumentChecker.check("s", s);
111:
112: // Short-circuit if the string is empty
113: int length = s.length();
114: if (length < 1) {
115: return "";
116: }
117:
118: // Construct a buffer
119: StringBuffer buffer = new StringBuffer(length * 2);
120:
121: // Loop through the string and just append whatever we find
122: // in UNENCODED_TO_ENCODED or if c > 127, encode the UTF-8 value
123: // of the character (cf http://www.w3.org/International/O-URL-code.html).
124: char[] content = s.toCharArray();
125: for (int i = 0; i < length; i++) {
126: int c = (int) content[i];
127: if (c < 128) {
128: buffer.append(UNENCODED_TO_ENCODED[c]);
129: } else if (c <= 0x07FF) { // non-ASCII <= 0x7FF
130: buffer.append('%');
131: buffer.append(Integer.toHexString(0xc0 | (c >> 6)));
132: buffer.append('%');
133: buffer.append(Integer.toHexString(0x80 | (c & 0x3F)));
134: } else { // 0x7FF < c <= 0xFFFF
135: buffer.append('%');
136: buffer.append(Integer.toHexString(0xe0 | (c >> 12)));
137: buffer.append('%');
138: buffer.append(Integer
139: .toHexString(0x80 | ((c >> 6) & 0x3F)));
140: buffer.append('%');
141: buffer.append(Integer.toHexString(0x80 | (c & 0x3F)));
142: }
143: }
144:
145: return buffer.toString();
146: }
147:
148: /**
149: * Decodes the specified URL encoded character string.
150: * http://www.w3.org/International/O-URL-code.html
151: *
152: * @param s
153: * the URL encoded string to decode, not <code>null</code>.
154: *
155: * @return
156: * unencoded version of the specified URL encoded character string,
157: * never <code>null</code>.
158: *
159: * @throws IllegalArgumentException
160: * if <code>s == null</code>.
161: *
162: * @throws FormatException
163: * if any of the following conditions is true:
164: * <ul>
165: * <li><code>s.{@link String#charAt(int) charAt}(s.{@link String#length() length}() - 1)</code>
166: * (last character is a percentage sign)
167: * <li><code>s.{@link String#charAt(int) charAt}(s.{@link String#length() length}() - 2)</code>
168: * (before-last character is a percentage sign)
169: * <li><code>s.{@link String#charAt(int) charAt}(<em>n</em>) == '%'
170: * && !( {@link org.xins.common.text.HexConverter}.{@link org.xins.common.text.HexConverter#isHexDigit(char) isDigit}(s.{@link String#charAt(int) charAt}(<em>n</em> + 1))
171: * && {@link org.xins.common.text.HexConverter}.{@link org.xins.common.text.HexConverter#isHexDigit(char) isDigit}(s.{@link String#charAt(int) charAt}(<em>n</em> + 2)))</code>
172: * (percentage sign is followed by 2 characters of which at least one is not a hexadecimal digit)
173: * </ul>
174: */
175: public static String decode(String s)
176: throws IllegalArgumentException, FormatException {
177:
178: // Check preconditions
179: MandatoryArgumentChecker.check("s", s);
180:
181: // If the string is empty, return the original string
182: int length = s.length();
183: if (length == 0) {
184: return s;
185: }
186:
187: // Avoid calls to charAt() method.
188: char[] string = s.toCharArray();
189:
190: // Loop through the string
191: StringBuffer buffer = new StringBuffer(length * 2);
192: int index = 0;
193: while (index < length) {
194:
195: // Get the character
196: char c = string[index];
197: int charAsInt = (int) c;
198:
199: // Special case: Recognize plus sign as a space
200: if (c == '+') {
201: buffer.append(' ');
202:
203: // Catch encoded characters
204: } else if (c == '%') {
205: ByteArrayOutputStream baos = new ByteArrayOutputStream();
206: while (index < length && string[index] == '%') {
207: if (index >= length - 2) {
208: throw new FormatException(s,
209: "Character at position " + index
210: + " has invalid value "
211: + charAsInt + '.');
212: }
213: charAsInt = (int) string[++index];
214: int decodedValue = digit(charAsInt, s, index);
215: decodedValue *= 16;
216: charAsInt = (int) string[++index];
217: decodedValue += digit(charAsInt, s, index);
218:
219: baos.write((int) decodedValue);
220:
221: index++;
222: }
223: try {
224: buffer.append(baos.toString("UTF-8"));
225: } catch (UnsupportedEncodingException uee) {
226: Utils.logProgrammingError(uee);
227: }
228: // Back to the last position
229: index--;
230:
231: // Append the character
232: } else {
233: buffer.append(c);
234: }
235:
236: // Proceed to the next character
237: index++;
238: }
239:
240: return buffer.toString();
241: }
242:
243: /**
244: * Convert a hexadecimal digit to a number.
245: *
246: * @param charAsInt
247: * the hexadecimal digit.
248: *
249: * @param s
250: * the String from which the character has been taken.
251: *
252: * @param index
253: * the position of the character within the String.
254: *
255: * @return
256: * the converted character converted to an int.
257: *
258: * @throws FormatException
259: * if c is not a numerical digit or a letter between 'a' and 'f' or
260: * 'A' or 'F'.
261: */
262: private static int digit(int charAsInt, String s, int index)
263: throws FormatException {
264: int decodedValue;
265: if (charAsInt >= CHAR_ZERO && charAsInt <= CHAR_NINE) {
266: decodedValue = charAsInt - CHAR_ZERO;
267: } else if (charAsInt >= CHAR_LOWER_A
268: && charAsInt <= CHAR_LOWER_F) {
269: decodedValue = charAsInt - CHAR_LOWER_A + 10;
270: } else if (charAsInt >= CHAR_UPPER_A
271: && charAsInt <= CHAR_UPPER_F) {
272: decodedValue = charAsInt - CHAR_UPPER_A + 10;
273: } else {
274: throw new FormatException(s, "Character at position "
275: + index + " is not a hex digit. Value is "
276: + charAsInt + '.');
277: }
278: return decodedValue;
279: }
280: }
|