001: /* UURIFactory
002: *
003: * $Id: UURIFactory.java 5106 2007-05-01 00:07:29Z gojomo $
004: *
005: * Created on July 16, 2004
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.net;
026:
027: import gnu.inet.encoding.IDNA;
028: import gnu.inet.encoding.IDNAException;
029: import it.unimi.dsi.mg4j.util.MutableString;
030:
031: import java.io.UnsupportedEncodingException;
032: import java.util.Arrays;
033: import java.util.BitSet;
034: import java.util.logging.Level;
035: import java.util.logging.Logger;
036: import java.util.regex.Matcher;
037: import java.util.regex.Pattern;
038:
039: import org.apache.commons.httpclient.URI;
040: import org.apache.commons.httpclient.URIException;
041: import org.archive.util.TextUtils;
042:
043: /**
044: * Factory that returns UURIs.
045: *
046: * Does escaping and fixup on URIs massaging in accordance with RFC2396
047: * and to match browser practice. For example, it removes any
048: * '..' if first thing in the path as per IE, converts backslashes to forward
049: * slashes, and discards any 'fragment'/anchor portion of the URI. This
050: * class will also fail URIs if they are longer than IE's allowed maximum
051: * length.
052: *
053: * <p>TODO: Test logging.
054: *
055: * @author stack
056: */
057: public class UURIFactory extends URI {
058:
059: private static final long serialVersionUID = -6146295130382209042L;
060:
061: /**
062: * Logging instance.
063: */
064: private static Logger logger = Logger.getLogger(UURIFactory.class
065: .getName());
066:
067: /**
068: * The single instance of this factory.
069: */
070: private static final UURIFactory factory = new UURIFactory();
071:
072: /**
073: * RFC 2396-inspired regex.
074: *
075: * From the RFC Appendix B:
076: * <pre>
077: * URI Generic Syntax August 1998
078: *
079: * B. Parsing a URI Reference with a Regular Expression
080: *
081: * As described in Section 4.3, the generic URI syntax is not sufficient
082: * to disambiguate the components of some forms of URI. Since the
083: * "greedy algorithm" described in that section is identical to the
084: * disambiguation method used by POSIX regular expressions, it is
085: * natural and commonplace to use a regular expression for parsing the
086: * potential four components and fragment identifier of a URI reference.
087: *
088: * The following line is the regular expression for breaking-down a URI
089: * reference into its components.
090: *
091: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
092: * 12 3 4 5 6 7 8 9
093: *
094: * The numbers in the second line above are only to assist readability;
095: * they indicate the reference points for each subexpression (i.e., each
096: * paired parenthesis). We refer to the value matched for subexpression
097: * <n> as $<n>. For example, matching the above expression to
098: *
099: * http://www.ics.uci.edu/pub/ietf/uri/#Related
100: *
101: * results in the following subexpression matches:
102: *
103: * $1 = http:
104: * $2 = http
105: * $3 = //www.ics.uci.edu
106: * $4 = www.ics.uci.edu
107: * $5 = /pub/ietf/uri/
108: * $6 = <undefined>
109: * $7 = <undefined>
110: * $8 = #Related
111: * $9 = Related
112: *
113: * where <undefined> indicates that the component is not present, as is
114: * the case for the query component in the above example. Therefore, we
115: * can determine the value of the four components and fragment as
116: *
117: * scheme = $2
118: * authority = $4
119: * path = $5
120: * query = $7
121: * fragment = $9
122: * </pre>
123: *
124: * --
125: * <p>Below differs from the rfc regex in that it has java escaping of
126: * regex characters and we allow a URI made of a fragment only (Added extra
127: * group so indexing is off by one after scheme).
128: */
129: final static Pattern RFC2396REGEX = Pattern
130: .compile("^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
131: // 12 34 5 6 7 8 9 A
132: // 2 1 54 6 87 3 A9
133: // 1: scheme
134: // 2: scheme:
135: // 3: //authority/path
136: // 4: //authority
137: // 5: authority
138: // 6: path
139: // 7: ?query
140: // 8: query
141: // 9: #fragment
142: // A: fragment
143:
144: public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+";
145: public static final String SLASH = "/";
146: public static final String HTTP = "http";
147: public static final String HTTP_PORT = ":80";
148: public static final String HTTPS = "https";
149: public static final String HTTPS_PORT = ":443";
150: public static final String DOT = ".";
151: public static final String EMPTY_STRING = "";
152: public static final String NBSP = "\u00A0";
153: public static final String SPACE = " ";
154: public static final String ESCAPED_SPACE = "%20";
155: public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
156: public static final String PIPE = "|";
157: public static final String PIPE_PATTERN = "\\|";
158: public static final String ESCAPED_PIPE = "%7C";
159: public static final String CIRCUMFLEX = "^";
160: public static final String CIRCUMFLEX_PATTERN = "\\^";
161: public static final String ESCAPED_CIRCUMFLEX = "%5E";
162: public static final String QUOT = "\"";
163: public static final String ESCAPED_QUOT = "%22";
164: public static final String SQUOT = "'";
165: public static final String ESCAPED_SQUOT = "%27";
166: public static final String APOSTROPH = "`";
167: public static final String ESCAPED_APOSTROPH = "%60";
168: public static final String LSQRBRACKET = "[";
169: public static final String LSQRBRACKET_PATTERN = "\\[";
170: public static final String ESCAPED_LSQRBRACKET = "%5B";
171: public static final String RSQRBRACKET = "]";
172: public static final String RSQRBRACKET_PATTERN = "\\]";
173: public static final String ESCAPED_RSQRBRACKET = "%5D";
174: public static final String LCURBRACKET = "{";
175: public static final String LCURBRACKET_PATTERN = "\\{";
176: public static final String ESCAPED_LCURBRACKET = "%7B";
177: public static final String RCURBRACKET = "}";
178: public static final String RCURBRACKET_PATTERN = "\\}";
179: public static final String ESCAPED_RCURBRACKET = "%7D";
180: public static final String BACKSLASH = "\\";
181: public static final String BACKSLASH_PATTERN = "\\\\";
182: public static final String ESCAPED_BACKSLASH = "%5C";
183: public static final String STRAY_SPACING = "[\n\r\t]+";
184: public static final String IMPROPERESC_REPLACE = "%25$1";
185: public static final String IMPROPERESC = "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
186: public static final String COMMERCIAL_AT = "@";
187: public static final char PERCENT_SIGN = '%';
188: public static final char COLON = ':';
189:
190: /**
191: * First percent sign in string followed by two hex chars.
192: */
193: public static final String URI_HEX_ENCODING = "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";
194:
195: /**
196: * Authority port number regex.
197: */
198: final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
199:
200: /**
201: * Characters we'll accept in the domain label part of a URI
202: * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
203: * with single intervening '.' characters.
204: *
205: * (We accept '_' because DNS servers have tolerated for many
206: * years counter to spec; we also accept dash patterns and ACE
207: * prefixes that will be rejected by IDN-punycoding attempt.)
208: */
209: final static String ACCEPTABLE_ASCII_DOMAIN = "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";
210:
211: /**
212: * Pattern that looks for case of three or more slashes after the
213: * scheme. If found, we replace them with two only as mozilla does.
214: */
215: final static Pattern HTTP_SCHEME_SLASHES = Pattern
216: .compile("^(https?://)/+(.*)");
217:
218: /**
219: * Pattern that looks for case of two or more slashes in a path.
220: */
221: final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");
222:
223: /**
224: * System property key for list of supported schemes.
225: */
226: private static final String SCHEMES_KEY = ".schemes";
227:
228: /**
229: * System property key for list of purposefully-ignored schemes.
230: */
231: private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";
232:
233: private String[] schemes = null;
234: private String[] ignoredSchemes = null;
235:
236: public static final int IGNORED_SCHEME = 9999999;
237:
238: /**
239: * Protected constructor.
240: */
241: private UURIFactory() {
242: super ();
243: String s = System.getProperty(this .getClass().getName()
244: + SCHEMES_KEY);
245: if (s != null && s.length() > 0) {
246: schemes = s.split("[, ]+");
247: Arrays.sort(schemes);
248: }
249: String ignored = System.getProperty(this .getClass().getName()
250: + IGNORED_SCHEMES_KEY);
251: if (ignored != null && ignored.length() > 0) {
252: ignoredSchemes = ignored.split("[, ]+");
253: Arrays.sort(ignoredSchemes);
254: }
255: }
256:
257: /**
258: * @param uri URI as string.
259: * @return An instance of UURI
260: * @throws URIException
261: */
262: public static UURI getInstance(String uri) throws URIException {
263: return UURIFactory.factory.create(uri);
264: }
265:
266: /**
267: * @param uri URI as string.
268: * @param charset Character encoding of the passed uri string.
269: * @return An instance of UURI
270: * @throws URIException
271: */
272: public static UURI getInstance(String uri, String charset)
273: throws URIException {
274: return UURIFactory.factory.create(uri, charset);
275: }
276:
277: /**
278: * @param base Base uri to use resolving passed relative uri.
279: * @param relative URI as string.
280: * @return An instance of UURI
281: * @throws URIException
282: */
283: public static UURI getInstance(UURI base, String relative)
284: throws URIException {
285: return UURIFactory.factory.create(base, relative);
286: }
287:
288: /**
289: * Test of whether passed String has an allowed URI scheme.
290: * First tests if likely scheme suffix. If so, we then test if its one of
291: * the supported schemes.
292: * @param possibleUrl URL string to examine.
293: * @return True if passed string looks like it could be an URL.
294: */
295: public static boolean hasSupportedScheme(String possibleUrl) {
296: boolean hasScheme = UURI.hasScheme(possibleUrl);
297: if (!hasScheme || UURIFactory.factory.schemes == null) {
298: return hasScheme;
299: }
300: String tmpStr = possibleUrl.substring(0, possibleUrl
301: .indexOf(':'));
302: return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
303: }
304:
305: /**
306: * @param uri URI as string.
307: * @return Instance of UURI.
308: * @throws URIException
309: */
310: private UURI create(String uri) throws URIException {
311: return create(uri, UURI.getDefaultProtocolCharset());
312: }
313:
314: /**
315: * @param uri URI as string.
316: * @param charset Original encoding of the string.
317: * @return Instance of UURI.
318: * @throws URIException
319: */
320: private UURI create(String uri, String charset) throws URIException {
321: UURI uuri = new UURI(fixup(uri, null, charset), true, charset);
322: if (logger.isLoggable(Level.FINE)) {
323: logger.fine("URI " + uri + " PRODUCT " + uuri.toString()
324: + " CHARSET " + charset);
325: }
326: return validityCheck(uuri);
327: }
328:
329: /**
330: * @param base UURI to use as a base resolving <code>relative</code>.
331: * @param relative Relative URI.
332: * @return Instance of UURI.
333: * @throws URIException
334: */
335: private UURI create(UURI base, String relative) throws URIException {
336: UURI uuri = new UURI(base,
337: new UURI(fixup(relative, base, base
338: .getProtocolCharset()), true, base
339: .getProtocolCharset()));
340: if (logger.isLoggable(Level.FINE)) {
341: logger.fine(" URI " + relative + " PRODUCT "
342: + uuri.toString() + " CHARSET "
343: + base.getProtocolCharset() + " BASE " + base);
344: }
345: return validityCheck(uuri);
346: }
347:
348: /**
349: * Check the generated UURI.
350: *
351: * At the least look at length of uuri string. We were seeing case
352: * where before escaping, string was < MAX_URL_LENGTH but after was
353: * >. Letting out a too-big message was causing us troubles later
354: * down the processing chain.
355: * @param uuri Created uuri to check.
356: * @return The passed <code>uuri</code> so can easily inline this check.
357: * @throws URIException
358: */
359: protected UURI validityCheck(UURI uuri) throws URIException {
360: if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
361: throw new URIException("Created (escaped) uuri > "
362: + UURI.MAX_URL_LENGTH + ": " + uuri.toString());
363: }
364: return uuri;
365: }
366:
367: /**
368: * Do heritrix fix-up on passed uri string.
369: *
370: * Does heritrix escaping; usually escaping done to make our behavior align
371: * with IEs. This method codifies our experience pulling URIs from the
372: * wilds. Its does all the escaping we want; its output can always be
373: * assumed to be 'escaped' (though perhaps to a laxer standard than the
374: * vanilla HttpClient URI class or official specs might suggest).
375: *
376: * @param uri URI as string.
377: * @param base May be null.
378: * @param e True if the uri is already escaped.
379: * @return A fixed up URI string.
380: * @throws URIException
381: */
382: private String fixup(String uri, final URI base,
383: final String charset) throws URIException {
384: if (uri == null) {
385: throw new NullPointerException();
386: } else if (uri.length() == 0 && base == null) {
387: throw new URIException(
388: "URI length is zero (and not relative).");
389: }
390:
391: if (uri.length() > UURI.MAX_URL_LENGTH) {
392: // We check length here and again later after all convertions.
393: throw new URIException("URI length > "
394: + UURI.MAX_URL_LENGTH + ": " + uri);
395: }
396:
397: // Replace nbsp with normal spaces (so that they get stripped if at
398: // ends, or encoded if in middle)
399: if (uri.indexOf(NBSP) >= 0) {
400: uri = TextUtils.replaceAll(NBSP, uri, SPACE);
401: }
402:
403: // Get rid of any trailing spaces or new-lines.
404: uri = uri.trim();
405:
406: // IE actually converts backslashes to slashes rather than to %5C.
407: // Since URIs that have backslashes usually work only with IE, we will
408: // convert backslashes to slashes as well.
409: // TODO: Maybe we can first convert backslashes by specs and than by IE
410: // so that we fetch both versions.
411: if (uri.indexOf(BACKSLASH) >= 0) {
412: uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
413: }
414:
415: // Remove stray TAB/CR/LF
416: uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING);
417:
418: // Test for the case of more than two slashes after the http(s) scheme.
419: // Replace with two slashes as mozilla does if found.
420: // See [ 788219 ] URI Syntax Errors stop page parsing.
421: Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);
422: if (matcher.matches()) {
423: uri = matcher.group(1) + matcher.group(2);
424: }
425:
426: // now, minimally escape any whitespace
427: uri = escapeWhitespace(uri);
428:
429: // For further processing, get uri elements. See the RFC2396REGEX
430: // comment above for explaination of group indices used in the below.
431: matcher = RFC2396REGEX.matcher(uri);
432: if (!matcher.matches()) {
433: throw new URIException("Failed parse of " + uri);
434: }
435: String uriScheme = checkUriElementAndLowerCase(matcher.group(2));
436: String uriSchemeSpecificPart = checkUriElement(matcher.group(3));
437: String uriAuthority = checkUriElement(matcher.group(5));
438: String uriPath = checkUriElement(matcher.group(6));
439: String uriQuery = checkUriElement(matcher.group(8));
440: // UNUSED String uriFragment = checkUriElement(matcher.group(10));
441:
442: // If a scheme, is it a supported scheme?
443: if (uriScheme != null && uriScheme.length() > 0
444: && this .schemes != null) {
445: if (!(Arrays.binarySearch(schemes, uriScheme) >= 0)) {
446: // unsupported; see if silently ignored
447: if ((Arrays.binarySearch(ignoredSchemes, uriScheme) >= 0)) {
448: throw new URIException(IGNORED_SCHEME,
449: "Ignored scheme: " + uriScheme);
450: } else {
451: throw new URIException("Unsupported scheme: "
452: + uriScheme);
453: }
454: }
455: }
456:
457: // Test if relative URI. If so, need a base to resolve against.
458: if (uriScheme == null || uriScheme.length() <= 0) {
459: if (base == null) {
460: throw new URIException("Relative URI but no base: "
461: + uri);
462: }
463: } else {
464: checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
465: uriSchemeSpecificPart);
466: }
467:
468: // fixup authority portion: lowercase/IDN-punycode any domain;
469: // remove stray trailing spaces
470: uriAuthority = fixupAuthority(uriAuthority);
471:
472: // Do some checks if absolute path.
473: if (uriSchemeSpecificPart != null
474: && uriSchemeSpecificPart.startsWith(SLASH)) {
475: if (uriPath != null) {
476: // Eliminate '..' if its first thing in the path. IE does this.
477: uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH,
478: uriPath, SLASH);
479: }
480: // Ensure root URLs end with '/': browsers always send "/"
481: // on the request-line, so we should consider "http://host"
482: // to be "http://host/".
483: if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
484: uriPath = SLASH;
485: }
486: }
487:
488: if (uriAuthority != null) {
489: if (uriScheme != null && uriScheme.length() > 0
490: && uriScheme.equals(HTTP)) {
491: uriAuthority = checkPort(uriAuthority);
492: uriAuthority = stripTail(uriAuthority, HTTP_PORT);
493: } else if (uriScheme != null && uriScheme.length() > 0
494: && uriScheme.equals(HTTPS)) {
495: uriAuthority = checkPort(uriAuthority);
496: uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
497: }
498: // Strip any prefix dot or tail dots from the authority.
499: uriAuthority = stripTail(uriAuthority, DOT);
500: uriAuthority = stripPrefix(uriAuthority, DOT);
501: } else {
502: // no authority; may be relative. consider stripping scheme
503: // to work-around org.apache.commons.httpclient.URI bug
504: // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
505: if (uriScheme != null && base != null
506: && uriScheme.equals(base.getScheme())) {
507: // uriScheme redundant and will only confound httpclient.URI
508: uriScheme = null;
509: }
510: }
511:
512: // Ensure minimal escaping. Use of 'lax' URI and URLCodec
513: // means minimal escaping isn't necessarily complete/consistent.
514: // There is a chance such lax encoding will throw exceptions
515: // later at inconvenient times.
516: //
517: // One reason for these bad escapings -- though not the only --
518: // is that the page is using an encoding other than the ASCII or the
519: // UTF-8 that is our default URI encoding. In this case the parent
520: // class is burping on the passed URL encoding. If the page encoding
521: // was passed into this factory, the encoding seems to be parsed
522: // correctly (See the testEscapedEncoding unit test).
523: //
524: // This fixup may cause us to miss content. There is the charset case
525: // noted above. TODO: Look out for cases where we fail other than for
526: // the above given reason which will be fixed when we address
527: // '[ 913687 ] Make extractors interrogate for charset'.
528:
529: uriPath = ensureMinimalEscaping(uriPath, charset);
530: uriQuery = ensureMinimalEscaping(uriQuery, charset,
531: LaxURLCodec.QUERY_SAFE);
532:
533: // Preallocate. The '1's and '2's in below are space for ':',
534: // '//', etc. URI characters.
535: MutableString s = new MutableString(
536: ((uriScheme != null) ? uriScheme.length() : 0)
537: + 1 // ';'
538: + ((uriAuthority != null) ? uriAuthority
539: .length() : 0)
540: + 2 // '//'
541: + ((uriPath != null) ? uriPath.length() : 0)
542: + 1 // '?'
543: + ((uriQuery != null) ? uriQuery.length() : 0));
544: appendNonNull(s, uriScheme, ":", true);
545: appendNonNull(s, uriAuthority, "//", false);
546: appendNonNull(s, uriPath, "", false);
547: appendNonNull(s, uriQuery, "?", false);
548: return s.toString();
549: }
550:
551: /**
552: * If http(s) scheme, check scheme specific part begins '//'.
553: * @throws URIException
554: * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
555: * Scheme Syntax
556: */
557: protected void checkHttpSchemeSpecificPartSlashPrefix(
558: final URI base, final String scheme,
559: final String schemeSpecificPart) throws URIException {
560: // Only apply this check if no base.
561: if (base != null) {
562: return;
563: }
564: if (scheme == null || scheme.length() <= 0) {
565: return;
566: }
567: if (!scheme.equals("http") && !scheme.equals("https")) {
568: return;
569: }
570: if (!schemeSpecificPart.startsWith("//")) {
571: throw new URIException("http scheme specific part must "
572: + "begin '//': " + schemeSpecificPart);
573: }
574: if (schemeSpecificPart.length() <= 2) {
575: throw new URIException("http scheme specific part is "
576: + "too short: " + schemeSpecificPart);
577: }
578: }
579:
580: /**
581: * Fixup 'authority' portion of URI, by removing any stray
582: * encoded spaces, lowercasing any domain names, and applying
583: * IDN-punycoding to Unicode domains.
584: *
585: * @param uriAuthority the authority string to fix
586: * @return fixed version
587: * @throws URIException
588: */
589: private String fixupAuthority(String uriAuthority)
590: throws URIException {
591: // Lowercase the host part of the uriAuthority; don't destroy any
592: // userinfo capitalizations. Make sure no illegal characters in
593: // domainlabel substring of the uri authority.
594: if (uriAuthority != null) {
595: // Get rid of any trailing escaped spaces:
596: // http://www.archive.org%20. Rare but happens.
597: // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
598: // if not, we shouldn't either.
599: while (uriAuthority.endsWith(ESCAPED_SPACE)) {
600: uriAuthority = uriAuthority.substring(0, uriAuthority
601: .length() - 3);
602: }
603:
604: // lowercase & IDN-punycode only the domain portion
605: int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
606: int portColonIndex = uriAuthority.indexOf(COLON,
607: (atIndex < 0) ? 0 : atIndex);
608: if (atIndex < 0 && portColonIndex < 0) {
609: // most common case: neither userinfo nor port
610: return fixupDomainlabel(uriAuthority);
611: } else if (atIndex < 0 && portColonIndex > -1) {
612: // next most common: port but no userinfo
613: String domain = fixupDomainlabel(uriAuthority
614: .substring(0, portColonIndex));
615: String port = uriAuthority.substring(portColonIndex);
616: return domain + port;
617: } else if (atIndex > -1 && portColonIndex < 0) {
618: // uncommon: userinfo, no port
619: String userinfo = uriAuthority
620: .substring(0, atIndex + 1);
621: String domain = fixupDomainlabel(uriAuthority
622: .substring(atIndex + 1));
623: return userinfo + domain;
624: } else {
625: // uncommon: userinfo, port
626: String userinfo = uriAuthority
627: .substring(0, atIndex + 1);
628: String domain = fixupDomainlabel(uriAuthority
629: .substring(atIndex + 1, portColonIndex));
630: String port = uriAuthority.substring(portColonIndex);
631: return userinfo + domain + port;
632: }
633: }
634: return uriAuthority;
635: }
636:
637: /**
638: * Fixup the domain label part of the authority.
639: *
640: * We're more lax than the spec. in that we allow underscores.
641: *
642: * @param label Domain label to fix.
643: * @return Return fixed domain label.
644: * @throws URIException
645: */
646: private String fixupDomainlabel(String label) throws URIException {
647:
648: // apply IDN-punycoding, as necessary
649: try {
650: // TODO: optimize: only apply when necessary, or
651: // keep cache of recent encodings
652: label = IDNA.toASCII(label);
653: } catch (IDNAException e) {
654: if (TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN, label)) {
655: // domain name has ACE prefix, leading/trailing dash, or
656: // underscore -- but is still a name we wish to tolerate;
657: // simply continue
658: } else {
659: // problematic domain: neither ASCII acceptable characters
660: // nor IDN-punycodable, so throw exception
661: // TODO: change to HeritrixURIException so distinguishable
662: // from URIExceptions in library code
663: URIException ue = new URIException(e + " " + label);
664: ue.initCause(e);
665: throw ue;
666: }
667: }
668: label = label.toLowerCase();
669: return label;
670: }
671:
672: /**
673: * Ensure that there all characters needing escaping
674: * in the passed-in String are escaped. Stray '%' characters
675: * are *not* escaped, as per browser behavior.
676: *
677: * @param u String to escape
678: * @param charset
679: * @return string with any necessary escaping applied
680: */
681: private String ensureMinimalEscaping(String u, final String charset) {
682: return ensureMinimalEscaping(u, charset,
683: LaxURLCodec.EXPANDED_URI_SAFE);
684: }
685:
686: /**
687: * Ensure that there all characters needing escaping
688: * in the passed-in String are escaped. Stray '%' characters
689: * are *not* escaped, as per browser behavior.
690: *
691: * @param u String to escape
692: * @param charset
693: * @param bitset
694: * @return string with any necessary escaping applied
695: */
696: private String ensureMinimalEscaping(String u,
697: final String charset, final BitSet bitset) {
698: if (u == null) {
699: return null;
700: }
701: for (int i = 0; i < u.length(); i++) {
702: char c = u.charAt(i);
703: if (!bitset.get(c)) {
704: try {
705: u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
706: } catch (UnsupportedEncodingException e) {
707: e.printStackTrace();
708: }
709: break;
710: }
711: }
712: return u;
713: }
714:
715: /**
716: * Escape any whitespace found.
717: *
718: * The parent class takes care of the bulk of escaping. But if any
719: * instance of escaping is found in the URI, then we ask for parent
720: * to do NO escaping. Here we escape any whitespace found irrespective
721: * of whether the uri has already been escaped. We do this for
722: * case where uri has been judged already-escaped only, its been
723: * incompletly done and whitespace remains. Spaces, etc., in the URI are
724: * a real pain. Their presence will break log file and ARC parsing.
725: * @param uri URI string to check.
726: * @return uri with spaces escaped if any found.
727: */
728: protected String escapeWhitespace(String uri) {
729: // Just write a new string anyways. The perl '\s' is not
730: // as inclusive as the Character.isWhitespace so there are
731: // whitespace characters we could miss. So, rather than
732: // write some awkward regex, just go through the string
733: // a character at a time. Only create buffer first time
734: // we find a space.
735: MutableString buffer = null;
736: for (int i = 0; i < uri.length(); i++) {
737: char c = uri.charAt(i);
738: if (Character.isWhitespace(c)) {
739: if (buffer == null) {
740: buffer = new MutableString(uri.length() + 2 /*If space, two extra characters (at least)*/);
741: buffer.append(uri.substring(0, i));
742: }
743: buffer.append("%");
744: String hexStr = Integer.toHexString(c);
745: if ((hexStr.length() % 2) > 0) {
746: buffer.append("0");
747: }
748: buffer.append(hexStr);
749:
750: } else {
751: if (buffer != null) {
752: buffer.append(c);
753: }
754: }
755: }
756: return (buffer != null) ? buffer.toString() : uri;
757: }
758:
759: /**
760: * Check port on passed http authority. Make sure the size is not larger
761: * than allowed: See the 'port' definition on this
762: * page, http://www.kerio.com/manual/wrp/en/418.htm.
763: * Also, we've seen port numbers of '0080' whose leading zeros confuse
764: * the parent class. Strip the leading zeros.
765: *
766: * @param uriAuthority
767: * @return Null or an amended port number.
768: * @throws URIException
769: */
770: private String checkPort(String uriAuthority) throws URIException {
771: Matcher m = PORTREGEX.matcher(uriAuthority);
772: if (m.matches()) {
773: String no = m.group(2);
774: if (no != null && no.length() > 0) {
775: // First check if the port has leading zeros
776: // as in '0080'. Strip them if it has and
777: // then reconstitute the uriAuthority. Be careful
778: // of cases where port is '0' or '000'.
779: while (no.charAt(0) == '0' && no.length() > 1) {
780: no = no.substring(1);
781: }
782: uriAuthority = m.group(1) + no;
783: // Now makesure the number is legit.
784: int portNo = Integer.parseInt(no);
785: if (portNo <= 0 || portNo > 65535) {
786: throw new URIException("Port out of bounds: "
787: + uriAuthority);
788: }
789: }
790: }
791: return uriAuthority;
792: }
793:
794: /**
795: * @param b Buffer to append to.
796: * @param str String to append if not null.
797: * @param substr Suffix or prefix to use if <code>str</code> is not null.
798: * @param suffix True if <code>substr</code> is a suffix.
799: */
800: private void appendNonNull(MutableString b, String str,
801: String substr, boolean suffix) {
802: if (str != null && str.length() > 0) {
803: if (!suffix) {
804: b.append(substr);
805: }
806: b.append(str);
807: if (suffix) {
808: b.append(substr);
809: }
810: }
811: }
812:
813: /**
814: * @param str String to work on.
815: * @param prefix Prefix to strip if present.
816: * @return <code>str</code> w/o <code>prefix</code>.
817: */
818: private String stripPrefix(String str, String prefix) {
819: return str.startsWith(prefix) ? str.substring(prefix.length(),
820: str.length()) : str;
821: }
822:
823: /**
824: * @param str String to work on.
825: * @param tail Tail to strip if present.
826: * @return <code>str</code> w/o <code>tail</code>.
827: */
828: private static String stripTail(String str, String tail) {
829: return str.endsWith(tail) ? str.substring(0, str.length()
830: - tail.length()) : str;
831: }
832:
833: /**
834: * @param element to examine.
835: * @return Null if passed null or an empty string otherwise
836: * <code>element</code>.
837: */
838: private String checkUriElement(String element) {
839: return (element == null || element.length() <= 0) ? null
840: : element;
841: }
842:
843: /**
844: * @param element to examine and lowercase if non-null.
845: * @return Null if passed null or an empty string otherwise
846: * <code>element</code> lowercased.
847: */
848: private String checkUriElementAndLowerCase(String element) {
849: String tmp = checkUriElement(element);
850: return (tmp != null) ? tmp.toLowerCase() : tmp;
851: }
852: }
|