001: /* IAURLCodec
002: *
003: * $Id: LaxURLCodec.java 4365 2006-07-18 00:40:16Z gojomo $
004: *
005: * Created on Jul 21, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.net;
026:
027: import java.io.ByteArrayOutputStream;
028: import java.io.UnsupportedEncodingException;
029: import java.util.BitSet;
030:
031: import org.apache.commons.codec.net.URLCodec;
032:
033: /**
034: * @author gojomo
035: */
036: public class LaxURLCodec extends URLCodec {
037: public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
038:
039: // passthrough constructor
040: public LaxURLCodec(String encoding) {
041: super (encoding);
042: }
043:
044: /**
045: * Decodes an array of URL safe 7-bit characters into an array of
046: * original bytes. Escaped characters are converted back to their
047: * original representation.
048: *
049: * Differs from URLCodec.decodeUrl() in that it throws no
050: * exceptions; bad or incomplete escape sequences are ignored
051: * and passed into result undecoded. This matches the behavior
052: * of browsers, which will use inconsistently-encoded URIs
053: * in HTTP request-lines.
054: *
055: * @param bytes array of URL safe characters
056: * @return array of original bytes
057: */
058: public static final byte[] decodeUrlLoose(byte[] bytes) {
059: if (bytes == null) {
060: return null;
061: }
062: ByteArrayOutputStream buffer = new ByteArrayOutputStream();
063: for (int i = 0; i < bytes.length; i++) {
064: int b = bytes[i];
065: if (b == '+') {
066: buffer.write(' ');
067: continue;
068: }
069: if (b == '%') {
070: if (i + 2 < bytes.length) {
071: int u = Character.digit((char) bytes[i + 1], 16);
072: int l = Character.digit((char) bytes[i + 2], 16);
073: if (u > -1 && l > -1) {
074: // good encoding
075: int c = ((u << 4) + l);
076: buffer.write((char) c);
077: i += 2;
078: continue;
079: } // else: bad encoding digits, leave '%' in place
080: } // else: insufficient encoding digits, leave '%' in place
081: }
082: buffer.write(b);
083: }
084: return buffer.toByteArray();
085: }
086:
087: /**
088: * A more expansive set of ASCII URI characters to consider as 'safe' to
089: * leave unencoded, based on actual browser behavior.
090: */
091: public static BitSet EXPANDED_URI_SAFE = new BitSet(256);
092: static {
093: // alpha characters
094: for (int i = 'a'; i <= 'z'; i++) {
095: EXPANDED_URI_SAFE.set(i);
096: }
097: for (int i = 'A'; i <= 'Z'; i++) {
098: EXPANDED_URI_SAFE.set(i);
099: }
100: // numeric characters
101: for (int i = '0'; i <= '9'; i++) {
102: EXPANDED_URI_SAFE.set(i);
103: }
104: // special chars
105: EXPANDED_URI_SAFE.set('-');
106: EXPANDED_URI_SAFE.set('~');
107: EXPANDED_URI_SAFE.set('_');
108: EXPANDED_URI_SAFE.set('.');
109: EXPANDED_URI_SAFE.set('*');
110: EXPANDED_URI_SAFE.set('/');
111: EXPANDED_URI_SAFE.set('=');
112: EXPANDED_URI_SAFE.set('&');
113: EXPANDED_URI_SAFE.set('+');
114: EXPANDED_URI_SAFE.set(',');
115: EXPANDED_URI_SAFE.set(':');
116: EXPANDED_URI_SAFE.set(';');
117: EXPANDED_URI_SAFE.set('@');
118: EXPANDED_URI_SAFE.set('$');
119: EXPANDED_URI_SAFE.set('!');
120: EXPANDED_URI_SAFE.set(')');
121: EXPANDED_URI_SAFE.set('(');
122: // experiments indicate: Firefox (1.0.6) never escapes '%'
123: EXPANDED_URI_SAFE.set('%');
124: // experiments indicate: Firefox (1.0.6) does not escape '|' or '''
125: EXPANDED_URI_SAFE.set('|');
126: EXPANDED_URI_SAFE.set('\'');
127: }
128:
129: public static BitSet QUERY_SAFE = new BitSet(256);
130: static {
131: QUERY_SAFE.or(EXPANDED_URI_SAFE);
132: // Tests indicate Firefox (1.0.7-1) doesn't escape curlies in query str.
133: QUERY_SAFE.set('{');
134: QUERY_SAFE.set('}');
135: // nor any of these: [ ] ^ ?
136: QUERY_SAFE.set('[');
137: QUERY_SAFE.set(']');
138: QUERY_SAFE.set('^');
139: QUERY_SAFE.set('?');
140: }
141:
142: /**
143: * Encodes a string into its URL safe form using the specified
144: * string charset. Unsafe characters are escaped.
145: *
146: * This method is analogous to superclass encode() methods,
147: * additionally offering the ability to specify a different
148: * 'safe' character set (such as EXPANDED_URI_SAFE).
149: *
150: * @param safe BitSet of characters that don't need to be encoded
151: * @param pString String to encode
152: * @param cs Name of character set to use
153: * @return Encoded version of <code>pString</code>.
154: * @throws UnsupportedEncodingException
155: */
156: public String encode(BitSet safe, String pString, String cs)
157: throws UnsupportedEncodingException {
158: if (pString == null) {
159: return null;
160: }
161: return new String(encodeUrl(safe, pString.getBytes(cs)),
162: "US-ASCII");
163: }
164: }
|