001: /*
002:
003: This software is OSI Certified Open Source Software.
004: OSI Certified is a certification mark of the Open Source Initiative.
005:
006: The license (Mozilla version 1.0) can be read at the MMBase site.
007: See http://www.MMBase.org/license
008:
009: */
010: package org.mmbase.util.transformers;
011:
012: import java.io.*;
013: import org.mmbase.util.logging.*;
014:
015: /**
016: * Escapes and Unescapes undesirable characters using % (URLEncoding)
017: *
018: * Contrary to java.net.URLEncoder, it does <em>not</em> encode '+'.
019: *
020: * @author vpro (as org.mmbase.util.URLEscape, still present in SCAN application)
021: * @author Michiel Meeuwissen
022: * @version $Id: UrlEscaper.java,v 1.1 2007/07/24 09:57:43 michiel Exp $
023: */
024: public class UrlEscaper extends ReaderTransformer {
025:
026: private static final Logger log = Logging
027: .getLoggerInstance(UrlEscaper.class);
028:
029: private static final int BUF_SIZE = 100;
030: /**
031: * List for all ASCII characters whether it can be part of an
032: * URL line.
033: * http://www.ietf.org/rfc/rfc1808.txt
034: * unreserved = alpha | digit | safe | extra
035: * alpha = lowalpha | hialpha
036: * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
037: * "8" | "9"
038:
039: * safe = "$" | "-" | "_" | "." | "+"
040: * extra = "!" | "*" | "'" | "(" | ")" | ","
041:
042: * correspondes with 'unreserved', first entry is 32, space.
043: */
044: private static final boolean isacceptable[] = { false, true, false,
045: false, false, false, false, false, // !"#$%&'
046: true, true, true, true, true, true, true, false, // ()*+,-./
047: true, true, true, true, true, true, true, true, // 01234567
048: true, true, false, false, false, false, false, false, // 89:;<=>?
049: false, true, true, true, true, true, true, true, // @ABCDEFG
050: true, true, true, true, true, true, true, true, // HIJKLMNO
051: true, true, true, true, true, true, true, true, // PQRSTUVW
052: true, true, true, false, false, false, false, true, // XYZ[\]^_
053: true, true, true, true, true, true, true, true, // `abcdefg
054: true, true, true, true, true, true, true, true, // hijklmno
055: true, true, true, true, true, true, true, true, // pqrstuvw
056: true, true, true, false, false, false, false, false // xyz{|}~
057: };
058:
059: /**
060: * Hex characters
061: */
062: private static final char hex[] = { '0', '1', '2', '3', '4', '5',
063: '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
064:
065: /**
066: * Character to use for escaping invalid characters
067: */
068: private static final int HEX_ESCAPE = (int) '%';
069:
070: /**
071: * Escape a url.
072: * Replaces 'invalid characters' with their Escaped code, i.e.
073: * the questionmark (?) is escaped with %3F.
074: */
075: public Writer transform(Reader r, Writer w) {
076: escape(new BufferedInputStream(
077: new org.mmbase.util.ReaderInputStream(r, "UTF-8")), w);
078: return w;
079: }
080:
081: public static void escape(BufferedInputStream r, Writer w) {
082: byte[] buf = new byte[BUF_SIZE];
083: try {
084: int n = r.read(buf, 0, BUF_SIZE);
085: while (n > 0) {
086: for (int i = 0; i < n; i++) {
087: int a = (int) buf[i] & 0xff;
088: if (a >= 32 && a < 128 && isacceptable[a - 32]) {
089: w.write((char) a);
090: } else {
091: w.write(HEX_ESCAPE);
092: w.write(hex[a >> 4]);
093: w.write(hex[a & 15]);
094: }
095: }
096: n = r.read(buf, 0, BUF_SIZE);
097: }
098: } catch (IOException ioe) {
099: log.warn(ioe.getMessage(), ioe);
100: }
101: }
102:
103: /**
104: * converts a HEX-character to its approprtiate byte value.
105: * i.e. 'A' is returned as '/011'
106: * @param c the Hex character
107: * @return the byte value as a <code>char</code>
108: */
109: private static char from_hex(char c) {
110: return (char) (c >= '0' && c <= '9' ? c - '0' : c >= 'A'
111: && c <= 'F' ? c - 'A' + 10 : c - 'a' + 10); /* accept small letters just in case */
112: }
113:
114: /**
115: * Unescape a url.
116: * Replaces escapesequenced with the actual character.
117: * i.e %3F is replaced with the the questionmark (?).
118: * @param url the urls to unescape
119: * @return the unescaped url.
120: */
121: public Writer transformBack(Reader reader, Writer w) {
122: BufferedReader br = new BufferedReader(reader, BUF_SIZE);
123: // can do something with using a buffer and anticipate that you can need a few chars more
124: // (perhaps 3).
125:
126: try {
127: int t = br.read();
128:
129: while (t != -1) {
130: if (t == HEX_ESCAPE) {
131: int n = br.read();
132: if (n != -1) {
133: char j = (char) (from_hex((char) n) * 16);
134: int n2 = br.read();
135: if (n2 != -1) {
136: j += from_hex((char) n2);
137: w.write(j);
138: } else {
139: w.write(t);
140: w.write(n);
141: break;
142: }
143: } else {
144: w.write(t);
145: break;
146: }
147: } else {
148: w.write(t);
149: }
150: t = br.read();
151: }
152: } catch (IOException ioe) {
153: log.warn(ioe.getMessage(), ioe);
154: }
155: return w;
156:
157: }
158:
159: /**
160: * Method for testing this class from the command line
161: */
162: public static void main(String args[]) {
163: UrlEscaper e = new UrlEscaper();
164: for (int i = 0; i < args.length; i++) {
165: log.info("Original : '" + args[i] + "'");
166: String escaped = e.transform(args[i]);
167: log.info("Escaped : '" + escaped + "'");
168: log.info("Unescaped again : '" + e.transformBack(escaped)
169: + "'");
170: }
171:
172: }
173: }
|