001: /*
002: * @(#)URLEncoder.java 1.25 06/10/10
003: *
004: * Copyright 1990-2006 Sun Microsystems, Inc. All Rights Reserved.
005: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License version
009: * 2 only, as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful, but
012: * WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * General Public License version 2 for more details (a copy is
015: * included at /legal/license.txt).
016: *
017: * You should have received a copy of the GNU General Public License
018: * version 2 along with this work; if not, write to the Free Software
019: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020: * 02110-1301 USA
021: *
022: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
023: * Clara, CA 95054 or visit www.sun.com if you need additional
024: * information or have any questions.
025: *
026: */
027:
028: package java.net;
029:
030: import java.io.ByteArrayOutputStream;
031: import java.io.BufferedWriter;
032: import java.io.OutputStreamWriter;
033: import java.io.IOException;
034: import java.io.UnsupportedEncodingException;
035: import java.util.BitSet;
036: import java.security.AccessController;
037: import java.security.PrivilegedAction;
038: import sun.security.action.GetBooleanAction;
039: import sun.security.action.GetPropertyAction;
040:
041: /**
042: * Utility class for HTML form encoding. This class contains static methods
043: * for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME
044: * format. For more information about HTML form encoding, consult the HTML
045: * <A HREF="http://www.w3.org/TR/html4/">specification</A>.
046: *
047: * <p>
048: * When encoding a String, the following rules apply:
049: *
050: * <p>
051: * <ul>
052: * <li>The alphanumeric characters "<code>a</code>" through
053: * "<code>z</code>", "<code>A</code>" through
054: * "<code>Z</code>" and "<code>0</code>"
055: * through "<code>9</code>" remain the same.
056: * <li>The special characters "<code>.</code>",
057: * "<code>-</code>", "<code>*</code>", and
058: * "<code>_</code>" remain the same.
059: * <li>The space character "<code> </code>" is
060: * converted into a plus sign "<code>+</code>".
061: * <li>All other characters are unsafe and are first converted into
062: * one or more bytes using some encoding scheme. Then each byte is
063: * represented by the 3-character string
064: * "<code>%<i>xy</i></code>", where <i>xy</i> is the
065: * two-digit hexadecimal representation of the byte.
066: * The recommended encoding scheme to use is UTF-8. However,
067: * for compatibility reasons, if an encoding is not specified,
068: * then the default encoding of the platform is used.
069: * </ul>
070: *
071: * <p>
072: * For example using UTF-8 as the encoding scheme the string "The
073: * string ü@foo-bar" would get converted to
074: * "The+string+%C3%BC%40foo-bar" because in UTF-8 the character
075: * ü is encoded as two bytes C3 (hex) and BC (hex), and the
076: * character @ is encoded as one byte 40 (hex).
077: *
078: * @author Herb Jellinek
079: * @version 1.18, 02/02/00
080: * @since JDK1.0
081: */
082: public class URLEncoder {
083: static BitSet dontNeedEncoding;
084: static final int caseDiff = ('a' - 'A');
085: static String dfltEncName = null;
086:
087: static {
088:
089: /* The list of characters that are not encoded has been
090: * determined as follows:
091: *
092: * RFC 2396 states:
093: * -----
094: * Data characters that are allowed in a URI but do not have a
095: * reserved purpose are called unreserved. These include upper
096: * and lower case letters, decimal digits, and a limited set of
097: * punctuation marks and symbols.
098: *
099: * unreserved = alphanum | mark
100: *
101: * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
102: *
103: * Unreserved characters can be escaped without changing the
104: * semantics of the URI, but this should not be done unless the
105: * URI is being used in a context that does not allow the
106: * unescaped character to appear.
107: * -----
108: *
109: * It appears that both Netscape and Internet Explorer escape
110: * all special characters from this list with the exception
111: * of "-", "_", ".", "*". While it is not clear why they are
112: * escaping the other characters, perhaps it is safest to
113: * assume that there might be contexts in which the others
114: * are unsafe if not escaped. Therefore, we will use the same
115: * list. It is also noteworthy that this is consistent with
116: * O'Reilly's "HTML: The Definitive Guide" (page 164).
117: *
118: * As a last note, Intenet Explorer does not encode the "@"
119: * character which is clearly not unreserved according to the
120: * RFC. We are being consistent with the RFC in this matter,
121: * as is Netscape.
122: *
123: */
124:
125: dontNeedEncoding = new BitSet(256);
126: int i;
127: for (i = 'a'; i <= 'z'; i++) {
128: dontNeedEncoding.set(i);
129: }
130: for (i = 'A'; i <= 'Z'; i++) {
131: dontNeedEncoding.set(i);
132: }
133: for (i = '0'; i <= '9'; i++) {
134: dontNeedEncoding.set(i);
135: }
136: dontNeedEncoding.set(' '); /* encoding a space to a + is done
137: * in the encode() method */
138: dontNeedEncoding.set('-');
139: dontNeedEncoding.set('_');
140: dontNeedEncoding.set('.');
141: dontNeedEncoding.set('*');
142:
143: dfltEncName = (String) AccessController
144: .doPrivileged(new GetPropertyAction("file.encoding"));
145: }
146:
147: /**
148: * You can't call the constructor.
149: */
150: private URLEncoder() {
151: }
152:
153: /**
154: * Translates a string into <code>x-www-form-urlencoded</code>
155: * format. This method uses the platform's default encoding
156: * as the encoding scheme to obtain the bytes for unsafe characters.
157: *
158: * @param s <code>String</code> to be translated.
159: * @deprecated The resulting string may vary depending on the platform's
160: * default encoding. Instead, use the encode(String,String)
161: * method to specify the encoding.
162: * @return the translated <code>String</code>.
163: */
164: public static String encode(String s) {
165:
166: String str = null;
167:
168: try {
169: str = encode(s, dfltEncName);
170: } catch (UnsupportedEncodingException e) {
171: // The system should always have the platform default
172: }
173:
174: return str;
175: }
176:
177: /**
178: * Translates a string into <code>application/x-www-form-urlencoded</code>
179: * format using a specific encoding scheme. This method uses the
180: * supplied encoding scheme to obtain the bytes for unsafe
181: * characters.
182: * <p>
183: * <em><strong>Note:</strong> The <a href=
184: * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
185: * World Wide Web Consortium Recommendation</a> states that
186: * UTF-8 should be used. Not doing so may introduce
187: * incompatibilites.</em>
188: *
189: * @param s <code>String</code> to be translated.
190: * @param enc The name of a supported
191: * <a href="../lang/package-summary.html#charenc">character
192: * encoding</a>.
193: * @return the translated <code>String</code>.
194: * @exception UnsupportedEncodingException
195: * If the named encoding is not supported
196: * @see URLDecoder#decode(java.lang.String, java.lang.String)
197: * @since 1.4
198: */
199: public static String encode(String s, String enc)
200: throws UnsupportedEncodingException {
201:
202: boolean needToChange = false;
203: boolean wroteUnencodedChar = false;
204: int maxBytesPerChar = 10; // rather arbitrary limit, but safe for now
205: StringBuffer out = new StringBuffer(s.length());
206: ByteArrayOutputStream buf = new ByteArrayOutputStream(
207: maxBytesPerChar);
208:
209: OutputStreamWriter writer = new OutputStreamWriter(buf, enc);
210:
211: for (int i = 0; i < s.length(); i++) {
212: int c = (int) s.charAt(i);
213: //System.out.println("Examining character: " + c);
214: if (dontNeedEncoding.get(c)) {
215: if (c == ' ') {
216: c = '+';
217: needToChange = true;
218: }
219: //System.out.println("Storing: " + c);
220: out.append((char) c);
221: wroteUnencodedChar = true;
222: } else {
223: // convert to external encoding before hex conversion
224: try {
225: if (wroteUnencodedChar) { // Fix for 4407610
226: writer = new OutputStreamWriter(buf, enc);
227: wroteUnencodedChar = false;
228: }
229: writer.write(c);
230: /*
231: * If this character represents the start of a Unicode
232: * surrogate pair, then pass in two characters. It's not
233: * clear what should be done if a bytes reserved in the
234: * surrogate pairs range occurs outside of a legal
235: * surrogate pair. For now, just treat it as if it were
236: * any other character.
237: */
238: if (c >= 0xD800 && c <= 0xDBFF) {
239: /*
240: System.out.println(Integer.toHexString(c)
241: + " is high surrogate");
242: */
243: if ((i + 1) < s.length()) {
244: int d = (int) s.charAt(i + 1);
245: /*
246: System.out.println("\tExamining "
247: + Integer.toHexString(d));
248: */
249: if (d >= 0xDC00 && d <= 0xDFFF) {
250: /*
251: System.out.println("\t"
252: + Integer.toHexString(d)
253: + " is low surrogate");
254: */
255: writer.write(d);
256: i++;
257: }
258: }
259: }
260: writer.flush();
261: } catch (IOException e) {
262: buf.reset();
263: continue;
264: }
265: byte[] ba = buf.toByteArray();
266: for (int j = 0; j < ba.length; j++) {
267: out.append('%');
268: char ch = Character
269: .forDigit((ba[j] >> 4) & 0xF, 16);
270: // converting to use uppercase letter as part of
271: // the hex value if ch is a letter.
272: if (Character.isLetter(ch)) {
273: ch -= caseDiff;
274: }
275: out.append(ch);
276: ch = Character.forDigit(ba[j] & 0xF, 16);
277: if (Character.isLetter(ch)) {
278: ch -= caseDiff;
279: }
280: out.append(ch);
281: }
282: buf.reset();
283: needToChange = true;
284: }
285: }
286:
287: return (needToChange ? out.toString() : s);
288: }
289: }
|