001: /* LaxURI
002: *
003: * $Id: LaxURI.java 4646 2006-09-22 17:23:04Z paul_jack $
004: *
005: * Created on Aug 3, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.net;
026:
027: import java.util.Arrays;
028: import java.util.BitSet;
029:
030: import org.apache.commons.httpclient.URI;
031: import org.apache.commons.httpclient.URIException;
032: import org.apache.commons.httpclient.util.EncodingUtil;
033:
034: /**
035: * URI subclass which allows partial/inconsistent encoding, matching
036: * the URIs which will be relayed in requests from popular web
037: * browsers (esp. Mozilla Firefox and MS IE).
038: *
039: * @author gojomo
040: */
041: public class LaxURI extends URI {
042:
043: private static final long serialVersionUID = 5273922211722239537L;
044:
045: final protected static char[] HTTP_SCHEME = { 'h', 't', 't', 'p' };
046: final protected static char[] HTTPS_SCHEME = { 'h', 't', 't', 'p',
047: 's' };
048:
049: protected static final BitSet lax_rel_segment = new BitSet(256);
050: // Static initializer for lax_rel_segment
051: static {
052: lax_rel_segment.or(rel_segment);
053: lax_rel_segment.set(':'); // allow ':'
054: // TODO: add additional allowances as need is demonstrated
055: }
056:
057: protected static final BitSet lax_abs_path = new BitSet(256);
058: static {
059: lax_abs_path.or(abs_path);
060: lax_abs_path.set('|'); // tests indicate Firefox (1.0.6) doesn't escape.
061: }
062:
063: protected static final BitSet lax_query = new BitSet(256);
064: static {
065: lax_query.or(query);
066: lax_query.set('{'); // tests indicate FF doesn't escape { in query
067: lax_query.set('}'); // tests indicate FF doesn't escape } in query
068: lax_query.set('|'); // tests indicate FF doesn't escape | in query
069: lax_query.set('['); // tests indicate FF doesn't escape [ in query
070: lax_query.set(']'); // tests indicate FF doesn't escape ] in query
071: lax_query.set('^'); // tests indicate FF doesn't escape ^ in query
072: }
073:
074: // passthrough initializers
075: public LaxURI(String uri, boolean escaped, String charset)
076: throws URIException {
077: super (uri, escaped, charset);
078: }
079:
080: public LaxURI(URI base, URI relative) throws URIException {
081: super (base, relative);
082: }
083:
084: public LaxURI(String uri, boolean escaped) throws URIException {
085: super (uri, escaped);
086: }
087:
088: public LaxURI() {
089: super ();
090: }
091:
092: // overridden to use this class's static decode()
093: public String getURI() throws URIException {
094: return (_uri == null) ? null : decode(_uri,
095: getProtocolCharset());
096: }
097:
098: // overridden to use this class's static decode()
099: public String getPath() throws URIException {
100: char[] p = getRawPath();
101: return (p == null) ? null : decode(p, getProtocolCharset());
102: }
103:
104: // overridden to use this class's static decode()
105: public String getPathQuery() throws URIException {
106: char[] rawPathQuery = getRawPathQuery();
107: return (rawPathQuery == null) ? null : decode(rawPathQuery,
108: getProtocolCharset());
109: }
110:
111: // overridden to use this class's static decode()
112: protected static String decode(char[] component, String charset)
113: throws URIException {
114: if (component == null) {
115: throw new IllegalArgumentException(
116: "Component array of chars may not be null");
117: }
118: return decode(new String(component), charset);
119: }
120:
121: // overridden to use IA's LaxURLCodec, which never throws DecoderException
122: protected static String decode(String component, String charset)
123: throws URIException {
124: if (component == null) {
125: throw new IllegalArgumentException(
126: "Component array of chars may not be null");
127: }
128: byte[] rawdata = null;
129: // try {
130: rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
131: .getAsciiBytes(component));
132: // } catch (DecoderException e) {
133: // throw new URIException(e.getMessage());
134: // }
135: return EncodingUtil.getString(rawdata, charset);
136: }
137:
138: // overidden to lax() the acceptable-char BitSet passed in
139: protected boolean validate(char[] component, BitSet generous) {
140: return super .validate(component, lax(generous));
141: }
142:
143: // overidden to lax() the acceptable-char BitSet passed in
144: protected boolean validate(char[] component, int soffset,
145: int eoffset, BitSet generous) {
146: return super .validate(component, soffset, eoffset,
147: lax(generous));
148: }
149:
150: /**
151: * Given a BitSet -- typically one of the URI superclass's
152: * predefined static variables -- possibly replace it with
153: * a more-lax version to better match the character sets
154: * actually left unencoded in web browser requests
155: *
156: * @param generous original BitSet
157: * @return (possibly more lax) BitSet to use
158: */
159: protected BitSet lax(BitSet generous) {
160: if (generous == rel_segment) {
161: // Swap in more lax allowable set
162: return lax_rel_segment;
163: }
164: if (generous == abs_path) {
165: return lax_abs_path;
166: }
167: if (generous == query) {
168: return lax_query;
169: }
170: // otherwise, leave as is
171: return generous;
172: }
173:
174: /**
175: * Coalesce the _host and _authority fields where
176: * possible.
177: *
178: * In the web crawl/http domain, most URIs have an
179: * identical _host and _authority. (There is no port
180: * or user info.) However, the superclass always
181: * creates two separate char[] instances.
182: *
183: * Notably, the lengths of these char[] fields are
184: * equal if and only if their values are identical.
185: * This method makes use of this fact to reduce the
186: * two instances to one where possible, slimming
187: * instances.
188: *
189: * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
190: */
191: protected void parseAuthority(String original, boolean escaped)
192: throws URIException {
193: super .parseAuthority(original, escaped);
194: if (_host != null && _authority != null
195: && _host.length == _authority.length) {
196: _host = _authority;
197: }
198: }
199:
200: /**
201: * Coalesce _scheme to existing instances, where appropriate.
202: *
203: * In the web-crawl domain, most _schemes are 'http' or 'https',
204: * but the superclass always creates a new char[] instance. For
205: * these two cases, we replace the created instance with a
206: * long-lived instance from a static field, saving 12-14 bytes
207: * per instance.
208: *
209: * @see org.apache.commons.httpclient.URI#setURI()
210: */
211: protected void setURI() {
212: if (_scheme != null) {
213: if (_scheme.length == 4
214: && Arrays.equals(_scheme, HTTP_SCHEME)) {
215: _scheme = HTTP_SCHEME;
216: } else if (_scheme.length == 5
217: && Arrays.equals(_scheme, HTTP_SCHEME)) {
218: _scheme = HTTPS_SCHEME;
219: }
220: }
221: super .setURI();
222: }
223:
224: /**
225: * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR
226: * http://issues.apache.org/jira/browse/HTTPCLIENT-588
227: *
228: * In order to avoid any possilbity of conflict with non-ASCII characters,
229: * Parse a URI reference as a <code>String</code> with the character
230: * encoding of the local system or the document.
231: * <p>
232: * The following line is the regular expression for breaking-down a URI
233: * reference into its components.
234: * <p><blockquote><pre>
235: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
236: * 12 3 4 5 6 7 8 9
237: * </pre></blockquote><p>
238: * For example, matching the above expression to
239: * http://jakarta.apache.org/ietf/uri/#Related
240: * results in the following subexpression matches:
241: * <p><blockquote><pre>
242: * $1 = http:
243: * scheme = $2 = http
244: * $3 = //jakarta.apache.org
245: * authority = $4 = jakarta.apache.org
246: * path = $5 = /ietf/uri/
247: * $6 = <undefined>
248: * query = $7 = <undefined>
249: * $8 = #Related
250: * fragment = $9 = Related
251: * </pre></blockquote><p>
252: *
253: * @param original the original character sequence
254: * @param escaped <code>true</code> if <code>original</code> is escaped
255: * @throws URIException If an error occurs.
256: */
257: protected void parseUriReference(String original, boolean escaped)
258: throws URIException {
259:
260: // validate and contruct the URI character sequence
261: if (original == null) {
262: throw new URIException("URI-Reference required");
263: }
264:
265: /* @
266: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
267: */
268: String tmp = original.trim();
269:
270: /*
271: * The length of the string sequence of characters.
272: * It may not be equal to the length of the byte array.
273: */
274: int length = tmp.length();
275:
276: /*
277: * Remove the delimiters like angle brackets around an URI.
278: */
279: if (length > 0) {
280: char[] firstDelimiter = { tmp.charAt(0) };
281: if (validate(firstDelimiter, delims)) {
282: if (length >= 2) {
283: char[] lastDelimiter = { tmp.charAt(length - 1) };
284: if (validate(lastDelimiter, delims)) {
285: tmp = tmp.substring(1, length - 1);
286: length = length - 2;
287: }
288: }
289: }
290: }
291:
292: /*
293: * The starting index
294: */
295: int from = 0;
296:
297: /*
298: * The test flag whether the URI is started from the path component.
299: */
300: boolean isStartedFromPath = false;
301: int atColon = tmp.indexOf(':');
302: int atSlash = tmp.indexOf('/');
303: if ((atColon <= 0 && !tmp.startsWith("//"))
304: || (atSlash >= 0 && atSlash < atColon)) {
305: isStartedFromPath = true;
306: }
307:
308: /*
309: * <p><blockquote><pre>
310: * @@@@@@@@
311: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
312: * </pre></blockquote><p>
313: */
314: int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#",
315: from);
316: if (at == -1) {
317: at = 0;
318: }
319:
320: /*
321: * Parse the scheme.
322: * <p><blockquote><pre>
323: * scheme = $2 = http
324: * @
325: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
326: * </pre></blockquote><p>
327: */
328: if (at > 0 && at < length && tmp.charAt(at) == ':') {
329: char[] target = tmp.substring(0, at).toLowerCase()
330: .toCharArray();
331: if (validate(target, scheme)) {
332: _scheme = target;
333: } else {
334: throw new URIException("incorrect scheme");
335: }
336: from = ++at;
337: }
338:
339: /*
340: * Parse the authority component.
341: * <p><blockquote><pre>
342: * authority = $4 = jakarta.apache.org
343: * @@
344: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
345: * </pre></blockquote><p>
346: */
347: // Reset flags
348: _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
349: if (0 <= at && at < length && tmp.charAt(at) == '/') {
350: // Set flag
351: _is_hier_part = true;
352: if (at + 2 < length && tmp.charAt(at + 1) == '/'
353: && !isStartedFromPath) {
354: // the temporary index to start the search from
355: int next = indexFirstOf(tmp, "/?#", at + 2);
356: if (next == -1) {
357: next = (tmp.substring(at + 2).length() == 0) ? at + 2
358: : tmp.length();
359: }
360: parseAuthority(tmp.substring(at + 2, next), escaped);
361: from = at = next;
362: // Set flag
363: _is_net_path = true;
364: }
365: if (from == at) {
366: // Set flag
367: _is_abs_path = true;
368: }
369: }
370:
371: /*
372: * Parse the path component.
373: * <p><blockquote><pre>
374: * path = $5 = /ietf/uri/
375: * @@@@@@
376: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
377: * </pre></blockquote><p>
378: */
379: if (from < length) {
380: // rel_path = rel_segment [ abs_path ]
381: int next = indexFirstOf(tmp, "?#", from);
382: if (next == -1) {
383: next = tmp.length();
384: }
385: if (!_is_abs_path) {
386: if (!escaped
387: && prevalidate(tmp.substring(from, next),
388: disallowed_rel_path)
389: || escaped
390: && validate(tmp.substring(from, next)
391: .toCharArray(), rel_path)) {
392: // Set flag
393: _is_rel_path = true;
394: } else if (!escaped
395: && prevalidate(tmp.substring(from, next),
396: disallowed_opaque_part)
397: || escaped
398: && validate(tmp.substring(from, next)
399: .toCharArray(), opaque_part)) {
400: // Set flag
401: _is_opaque_part = true;
402: } else {
403: // the path component may be empty
404: _path = null;
405: }
406: }
407: String s = tmp.substring(from, next);
408: if (escaped) {
409: setRawPath(s.toCharArray());
410: } else {
411: setPath(s);
412: }
413: at = next;
414: }
415:
416: // set the charset to do escape encoding
417: String charset = getProtocolCharset();
418:
419: /*
420: * Parse the query component.
421: * <p><blockquote><pre>
422: * query = $7 = <undefined>
423: * @@@@@@@@@
424: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
425: * </pre></blockquote><p>
426: */
427: if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
428: int next = tmp.indexOf('#', at + 1);
429: if (next == -1) {
430: next = tmp.length();
431: }
432: if (escaped) {
433: _query = tmp.substring(at + 1, next).toCharArray();
434: if (!validate(_query, query)) {
435: throw new URIException("Invalid query");
436: }
437: } else {
438: _query = encode(tmp.substring(at + 1, next),
439: allowed_query, charset);
440: }
441: at = next;
442: }
443:
444: /*
445: * Parse the fragment component.
446: * <p><blockquote><pre>
447: * fragment = $9 = Related
448: * @@@@@@@@
449: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
450: * </pre></blockquote><p>
451: */
452: if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
453: if (at + 1 == length) { // empty fragment
454: _fragment = "".toCharArray();
455: } else {
456: _fragment = (escaped) ? tmp.substring(at + 1)
457: .toCharArray() : encode(tmp.substring(at + 1),
458: allowed_fragment, charset);
459: }
460: }
461:
462: // set this URI.
463: setURI();
464: }
465:
466: }
|