001: /*
002: * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
004: */
005:
006: /*
007: * backport of j2se 1.4 URLEncoder.java 1.25 01/12/03
008: *
009: * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
010: * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
011: */
012:
013: package com.sun.portal.search.util;
014:
015: import java.io.ByteArrayOutputStream;
016: import java.io.BufferedWriter;
017: import java.io.OutputStreamWriter;
018: import java.io.IOException;
019: import java.io.UnsupportedEncodingException;
020: import java.util.BitSet;
021: import java.security.AccessController;
022: import java.security.PrivilegedAction;
023:
024: //import sun.security.action.GetBooleanAction;
025: //import sun.security.action.GetPropertyAction;
026:
/**
 * Utility class for HTML form encoding and a few related string escaping
 * helpers. It contains static methods for converting a String to the
 * <CODE>application/x-www-form-urlencoded</CODE> MIME format, for escaping
 * a String so it can be placed inside double quotes, for escaping HTML
 * markup characters, and for producing Unicode escape sequences.
 * For more information about HTML form encoding, consult the HTML
 * <A HREF="http://www.w3.org/TR/html4/">specification</A>.
 *
 * <p>
 * When URL-encoding a String, the following rules apply:
 *
 * <p>
 * <ul>
 * <li>The alphanumeric characters &quot;<code>a</code>&quot; through
 *     &quot;<code>z</code>&quot;, &quot;<code>A</code>&quot; through
 *     &quot;<code>Z</code>&quot; and &quot;<code>0</code>&quot;
 *     through &quot;<code>9</code>&quot; remain the same.
 * <li>The special characters &quot;<code>.</code>&quot;,
 *     &quot;<code>-</code>&quot;, &quot;<code>*</code>&quot;, and
 *     &quot;<code>_</code>&quot; remain the same.
 * <li>The space character &quot;<code>&nbsp;</code>&quot; is
 *     converted into a plus sign &quot;<code>+</code>&quot;.
 * <li>All other characters are unsafe and are first converted into
 *     one or more bytes using some encoding scheme. Then each byte is
 *     represented by the 3-character string
 *     &quot;<code>%<i>xy</i></code>&quot;, where <i>xy</i> is the
 *     two-digit (uppercase) hexadecimal representation of the byte.
 *     The recommended encoding scheme to use is UTF-8. If an encoding
 *     is not specified, this class uses ISO-8859-1 (see
 *     {@link #dfltEncName}), not the platform default.
 * </ul>
 *
 * <p>
 * For example using UTF-8 as the encoding scheme the string &quot;The
 * string &uuml;&#64;foo-bar&quot; would get converted to
 * &quot;The+string+%C3%BC%40foo-bar&quot; because in UTF-8 the character
 * &uuml; is encoded as two bytes C3 (hex) and BC (hex), and the
 * character &#64; is encoded as one byte 40 (hex).
 *
 * @author  Herb Jellinek
 * @version 1.25, 12/03/01
 * @since   JDK1.0
 */
public class Encoder {

    /** Byte values (0-255) that are copied to the output without percent-escaping. */
    static BitSet dontNeedEncoding;

    /** Distance between a lowercase ASCII letter and its uppercase form. */
    static final int caseDiff = ('a' - 'A');

    /** Encoding used by {@link #urlEncode(String)}; set to ISO-8859-1 in the static initializer. */
    static String dfltEncName = null;

    /** Charset used whenever no encoding name is available. */
    private static final String LATIN_1 = "ISO-8859-1";

    /** Uppercase hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };

    static {

        /* The list of characters that are not encoded has been
         * determined as follows:
         *
         * RFC 2396 states:
         * -----
         * Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved.  These include upper
         * and lower case letters, decimal digits, and a limited set of
         * punctuation marks and symbols.
         *
         * unreserved  = alphanum | mark
         *
         * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         *
         * Unreserved characters can be escaped without changing the
         * semantics of the URI, but this should not be done unless the
         * URI is being used in a context that does not allow the
         * unescaped character to appear.
         * -----
         *
         * It appears that both Netscape and Internet Explorer escape
         * all special characters from this list with the exception
         * of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to
         * assume that there might be contexts in which the others
         * are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with
         * O'Reilly's "HTML: The Definitive Guide" (page 164).
         *
         * As a last note, Internet Explorer does not encode the "@"
         * character which is clearly not unreserved according to the
         * RFC. We are being consistent with the RFC in this matter,
         * as is Netscape.
         */

        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // A space is flagged as "safe" only so that it skips the percent
        // escape; urlEncode() turns it into a '+'.
        dontNeedEncoding.set(' ');
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        // The j2se original reads the "file.encoding" system property here;
        // this backport pins the default to ISO-8859-1 instead.
        dfltEncName = LATIN_1;
    }

    /**
     * You can't call the constructor.
     */
    private Encoder() {
    }

    /**
     * Translates a string into <code>x-www-form-urlencoded</code>
     * format using the class default encoding ({@link #dfltEncName},
     * which the static initializer sets to ISO-8859-1) to obtain the
     * bytes for unsafe characters.
     *
     * @param   s   <code>String</code> to be translated.
     * @deprecated Characters that the default encoding cannot represent
     *          are replaced during conversion. Instead, use
     *          {@link #urlEncode(String, String)} to specify the encoding.
     * @return  the translated <code>String</code>, or <code>null</code>
     *          if the default encoding is not supported by the runtime.
     */
    public static String urlEncode(String s) {
        try {
            return urlEncode(s, dfltEncName);
        } catch (UnsupportedEncodingException e) {
            // Not reachable while dfltEncName is ISO-8859-1, which every
            // Java runtime must support. The historical null result is
            // kept for callers in case the field is reassigned.
            return null;
        }
    }

    /**
     * Rewrite of the inefficient j2se 1.4 URL encoder
     * (the j2se 1.3 URLEncoder can't handle certain chars, eg, U+8C9D).
     *
     * Translates a string into <code>application/x-www-form-urlencoded</code>
     * format using a specific encoding scheme. The string is converted to
     * bytes with the supplied encoding; each byte is then either copied
     * (safe characters), turned into <code>+</code> (space) or written as
     * <code>%XY</code> with uppercase hex digits.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     * <p>
     *
     * @param   s   <code>String</code> to be translated.
     * @param   enc   The name of a supported character encoding. If
     *          <code>null</code>, ISO-8859-1 is used.
     * @return  the translated <code>String</code>.
     * @exception  UnsupportedEncodingException
     *             If the named encoding is not supported
     * @see java.net.URLDecoder#decode(java.lang.String, java.lang.String)
     */
    public static String urlEncode(String s, String enc)
            throws UnsupportedEncodingException {

        // A null encoding used to bypass the byte conversion entirely, so a
        // char above 0xFF was emitted as a single escape of its low 8 bits.
        // Going through ISO-8859-1 yields identical output for chars up to
        // 0xFF and a well-defined '?' substitution for everything else.
        String charset = (enc != null) ? enc : LATIN_1;
        byte[] bytes = s.getBytes(charset);

        // Worst case every byte becomes a three character escape.
        StringBuffer out = new StringBuffer(bytes.length * 3);

        for (int i = 0; i < bytes.length; i++) {
            int b = bytes[i] & 0xFF;
            if (dontNeedEncoding.get(b)) {
                out.append(b == ' ' ? '+' : (char) b);
            } else {
                out.append('%');
                out.append(HEX_DIGITS[b >> 4]);
                out.append(HEX_DIGITS[b & 0xF]);
            }
        }

        return out.toString();
    }

    /**
     * To escape a String without quoting it. Escapes newline, carriage
     * return, backslash and double quote chars.
     * @param s the string to escape
     * @return a string that can be safely placed in double quotes
     * to build a Java String, Nova query, etc
     */
    static public String quotedEscape(String s) {
        return quotedEscape(s, false);
    }

    /**
     * To escape and optionally quote a String. Newline and carriage return
     * become the two character sequences backslash-n and backslash-r;
     * backslash and double quote are prefixed with a backslash.
     * @param s the string to escape
     * @param quoteResult whether to wrap the result string in double quotes
     * @return a string that can be safely placed in double quotes
     * to build a Java String, Nova query, etc
     */
    static public String quotedEscape(String s, boolean quoteResult) {
        StringBuffer sb = new StringBuffer(s.length() + 20);
        if (quoteResult) {
            sb.append('"');
        }
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            switch (c) {
                case '\n':
                    sb.append("\\n");
                    break;
                case '\r':
                    sb.append("\\r");
                    break;
                case '\\':
                case '"':
                    sb.append('\\').append(c);
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        if (quoteResult) {
            sb.append('"');
        }
        return sb.toString();
    }

    /**
     * Appends <code>s</code> to <code>sb</code>, replacing the five HTML
     * markup characters (less-than, greater-than, ampersand, double quote
     * and apostrophe) with entity references. All other characters are
     * copied unchanged.
     * @param s string to be html encoded
     * @param sb string buffer to hold results
     * @return the same buffer that was passed in as <code>sb</code>
     */
    public static StringBuffer htmlEncode(String s, StringBuffer sb) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                case '&':
                    sb.append("&amp;");
                    break;
                case '"':
                    sb.append("&quot;");
                    break;
                case '\'':
                    // The numeric form is used because the named "apos"
                    // entity is not part of HTML 4 (only understood by
                    // Mozilla at the time this was written).
                    sb.append("&#39;");
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb;
    }

    /**
     * Returns a copy of <code>s</code> in which the five HTML markup
     * characters (less-than, greater-than, ampersand, double quote and
     * apostrophe) are replaced with entity references.
     * @param s string to be html encoded
     * @return the encoded string
     */
    public static String htmlEncode(String s) {
        StringBuffer buf = new StringBuffer(s.length() + 16);
        return htmlEncode(s, buf).toString();
    }

    /**
     * Converts a character to a unicode escape sequence. Printable ASCII
     * characters (32 through 126) are returned unchanged; every other
     * character becomes a backslash, the letter u and four lowercase
     * hexadecimal digits.
     * @param c the character to convert
     * @return the character itself or its six character escape sequence
     */
    public static final String unicodeEscape(char c) {
        // Printable ASCII starts at 32 (space); the previous lower bound of
        // 30 let the control characters 0x1E and 0x1F through unescaped.
        if (c >= 32 && c <= 126) {
            return String.valueOf(c);
        }
        String hex = Integer.toHexString(c);
        StringBuffer b = new StringBuffer(6);
        b.append("\\u");
        // Left-pad to exactly four hex digits.
        for (int pad = hex.length(); pad < 4; pad++) {
            b.append('0');
        }
        b.append(hex);
        return b.toString();
    }

    /**
     * Converts a String to a unicode escaped String by applying
     * {@link #unicodeEscape(char)} to each character in order.
     * @param s the string to convert
     * @return the escaped string
     */
    public static final String unicodeEscape(String s) {
        StringBuffer b = new StringBuffer(s.length());
        for (int i = 0; i < s.length(); ++i) {
            b.append(unicodeEscape(s.charAt(i)));
        }
        return b.toString();
    }

}
|