001: /*
002: * Copyright 1999-2004 The Apache Software Foundation
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.tomcat.util.buf;
018:
019: import java.io.CharArrayWriter;
020: import java.io.IOException;
021: import java.io.Writer;
022: import java.util.BitSet;
023:
024: /** Efficient implementation for encoders.
025: * This class is not thread safe - you need one encoder per thread.
026: * The encoder will save and recycle the internal objects, avoiding
027: * garbage.
028: *
029: * You can add extra characters that you want preserved, for example
030: * while encoding a URL you can add "/".
031: *
032: * @author Costin Manolache
033: */
034: public final class UEncoder {
035:
036: // Not static - the set may differ ( it's better than adding
037: // an extra check for "/", "+", etc
038: private BitSet safeChars = null;
039: private C2BConverter c2b = null;
040: private ByteChunk bb = null;
041:
042: private String encoding = "UTF8";
043: private static final int debug = 0;
044:
045: public UEncoder() {
046: initSafeChars();
047: }
048:
049: public void setEncoding(String s) {
050: encoding = s;
051: }
052:
053: public void addSafeCharacter(char c) {
054: safeChars.set(c);
055: }
056:
057: /** URL Encode string, using a specified encoding.
058: * @param s string to be encoded
059: * @param enc character encoding, for chars >%80 ( use UTF8 if not set,
060: * as recommended in RFCs)
061: * @param reserved extra characters to preserve ( "/" - if s is a URL )
062: */
063: public void urlEncode(Writer buf, String s) throws IOException {
064: if (c2b == null) {
065: bb = new ByteChunk(16); // small enough.
066: c2b = new C2BConverter(bb, encoding);
067: }
068:
069: for (int i = 0; i < s.length(); i++) {
070: int c = (int) s.charAt(i);
071: if (safeChars.get(c)) {
072: if (debug > 0)
073: log("Safe: " + (char) c);
074: buf.write((char) c);
075: } else {
076: if (debug > 0)
077: log("Unsafe: " + (char) c);
078: c2b.convert((char) c);
079:
080: // "surrogate" - UTF is _not_ 16 bit, but 21 !!!!
081: // ( while UCS is 31 ). Amazing...
082: if (c >= 0xD800 && c <= 0xDBFF) {
083: if ((i + 1) < s.length()) {
084: int d = (int) s.charAt(i + 1);
085: if (d >= 0xDC00 && d <= 0xDFFF) {
086: if (debug > 0)
087: log("Unsafe: " + c);
088: c2b.convert((char) d);
089: i++;
090: }
091: }
092: }
093:
094: c2b.flushBuffer();
095:
096: urlEncode(buf, bb.getBuffer(), bb.getOffset(), bb
097: .getLength());
098: bb.recycle();
099: }
100: }
101: }
102:
103: /**
104: */
105: public void urlEncode(Writer buf, byte bytes[], int off, int len)
106: throws IOException {
107: for (int j = off; j < len; j++) {
108: buf.write('%');
109: char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16);
110: if (debug > 0)
111: log("Encode: " + ch);
112: buf.write(ch);
113: ch = Character.forDigit(bytes[j] & 0xF, 16);
114: if (debug > 0)
115: log("Encode: " + ch);
116: buf.write(ch);
117: }
118: }
119:
120: /**
121: * Utility funtion to re-encode the URL.
122: * Still has problems with charset, since UEncoder mostly
123: * ignores it.
124: */
125: public String encodeURL(String uri) {
126: String outUri = null;
127: try {
128: // XXX optimize - recycle, etc
129: CharArrayWriter out = new CharArrayWriter();
130: urlEncode(out, uri);
131: outUri = out.toString();
132: } catch (IOException iex) {
133: }
134: return outUri;
135: }
136:
137: // -------------------- Internal implementation --------------------
138:
139: //
140: private void init() {
141:
142: }
143:
144: private void initSafeChars() {
145: safeChars = new BitSet(128);
146: int i;
147: for (i = 'a'; i <= 'z'; i++) {
148: safeChars.set(i);
149: }
150: for (i = 'A'; i <= 'Z'; i++) {
151: safeChars.set(i);
152: }
153: for (i = '0'; i <= '9'; i++) {
154: safeChars.set(i);
155: }
156: //safe
157: safeChars.set('$');
158: safeChars.set('-');
159: safeChars.set('_');
160: safeChars.set('.');
161:
162: // Dangerous: someone may treat this as " "
163: // RFC1738 does allow it, it's not reserved
164: // safeChars.set('+');
165: //extra
166: safeChars.set('!');
167: safeChars.set('*');
168: safeChars.set('\'');
169: safeChars.set('(');
170: safeChars.set(')');
171: safeChars.set(',');
172: }
173:
174: private static void log(String s) {
175: System.out.println("Encoder: " + s);
176: }
177: }
|