001: /* UURI
002: *
003: * $Id: UURI.java 4646 2006-09-22 17:23:04Z paul_jack $
004: *
005: * Created on Apr 18, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.net;
026:
027: import java.io.File;
028: import java.io.Serializable;
029: import java.net.URI;
030: import java.net.URISyntaxException;
031: import java.util.logging.Level;
032: import java.util.logging.Logger;
033:
034: import org.apache.commons.httpclient.URIException;
035: import org.archive.crawler.datamodel.CandidateURI;
036: import org.archive.util.SURT;
037: import org.archive.util.TextUtils;
038:
039: /**
040: * Usable URI.
041: *
042: * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
043: * and methods. It cannot be instantiated directly. Go via UURIFactory.
044: *
045: * <p>We used to use {@link java.net.URI} for parsing URIs but ran across
046: * quirky behaviors and bugs. {@link java.net.URI} is not subclassable --
047: * its final -- and its unlikely that java.net.URI will change any time soon
048: * (See Gordon's considered petition here:
049: * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
050: * should have loose/tolerant/compatibility option (or allow reuse)</a>).
051: *
052: * <p>This class tries to cache calculated strings such as the extracted host
053: * and this class as a string rather than have the parent class rerun its
054: * calculation everytime.
055: *
056: * @author gojomo
057: * @author stack
058: *
059: * @see org.apache.commons.httpclient.URI
060: */
061: public class UURI extends LaxURI implements CharSequence, Serializable {
062:
063: private static final long serialVersionUID = -1277570889914647093L;
064:
065: private static Logger LOGGER = Logger.getLogger(UURI.class
066: .getName());
067:
068: /**
069: * Consider URIs too long for IE as illegal.
070: */
071: public final static int MAX_URL_LENGTH = 2083;
072:
073: public static final String MASSAGEHOST_PATTERN = "^www\\d*\\.";
074:
075: /**
076: * Cache of the host name.
077: *
078: * Super class calculates on every call. Profiling shows us spend 30% of
079: * total elapsed time in URI class.
080: */
081: private transient String cachedHost = null;
082:
083: /**
084: * Cache of this uuri escaped as a string.
085: *
086: * Super class calculates on every call. Profiling shows us spend 30% of
087: * total elapsed time in URI class.
088: */
089: private transient String cachedEscapedURI = null;
090:
091: /**
092: * Cache of this uuri escaped as a string.
093: *
094: * Super class calculates on every call. Profiling shows us spend 30% of
095: * total elapsed time in URI class.
096: */
097: private transient String cachedString = null;
098:
099: /**
100: * Cached authority minus userinfo.
101: */
102: private transient String cachedAuthorityMinusUserinfo = null;
103:
104: /**
105: * Cache of this uuri in SURT format
106: */
107: private transient String surtForm = null;
108:
109: // Technically, underscores are disallowed in the domainlabel
110: // portion of hostname according to rfc2396 but we'll be more
111: // loose and allow them. See: [ 1072035 ] [uuri] Underscore in
112: // host messes up port parsing.
113: static {
114: hostname.set('_');
115: }
116:
117: /**
118: * Shutdown access to default constructor.
119: */
120: protected UURI() {
121: super ();
122: }
123:
124: /**
125: * @param uri String representation of an absolute URI.
126: * @param escaped If escaped.
127: * @param charset Charset to use.
128: * @throws org.apache.commons.httpclient.URIException
129: */
130: protected UURI(String uri, boolean escaped, String charset)
131: throws URIException {
132: super (uri, escaped, charset);
133: normalize();
134: }
135:
136: /**
137: * @param relative String representation of URI.
138: * @param base Parent UURI to use derelativizing.
139: * @throws org.apache.commons.httpclient.URIException
140: */
141: protected UURI(UURI base, UURI relative) throws URIException {
142: super (base, relative);
143: normalize();
144: }
145:
146: /**
147: * @param uri String representation of a URI.
148: * @param escaped If escaped.
149: * @throws NullPointerException
150: * @throws URIException
151: */
152: public UURI(String uri, boolean escaped) throws URIException,
153: NullPointerException {
154: super (uri, escaped);
155: normalize();
156: }
157:
158: /**
159: * @param uri URI as string that is resolved relative to this UURI.
160: * @return UURI that uses this UURI as base.
161: * @throws URIException
162: */
163: public UURI resolve(String uri) throws URIException {
164: return resolve(uri, false, // assume not escaped
165: this .getProtocolCharset());
166: }
167:
168: /**
169: * @param uri URI as string that is resolved relative to this UURI.
170: * @param e True if escaped.
171: * @return UURI that uses this UURI as base.
172: * @throws URIException
173: */
174: public UURI resolve(String uri, boolean e) throws URIException {
175: return resolve(uri, e, this .getProtocolCharset());
176: }
177:
178: /**
179: * @param uri URI as string that is resolved relative to this UURI.
180: * @param e True if uri is escaped.
181: * @param charset Charset to use.
182: * @return UURI that uses this UURI as base.
183: * @throws URIException
184: */
185: public UURI resolve(String uri, boolean e, String charset)
186: throws URIException {
187: return new UURI(this , new UURI(uri, e, charset));
188: }
189:
190: /**
191: * Test an object if this UURI is equal to another.
192: *
193: * @param obj an object to compare
194: * @return true if two URI objects are equal
195: */
196: public boolean equals(Object obj) {
197:
198: // normalize and test each components
199: if (obj == this ) {
200: return true;
201: }
202: if (!(obj instanceof UURI)) {
203: return false;
204: }
205: UURI another = (UURI) obj;
206: // scheme
207: if (!equals(this ._scheme, another._scheme)) {
208: return false;
209: }
210: // is_opaque_part or is_hier_part? and opaque
211: if (!equals(this ._opaque, another._opaque)) {
212: return false;
213: }
214: // is_hier_part
215: // has_authority
216: if (!equals(this ._authority, another._authority)) {
217: return false;
218: }
219: // path
220: if (!equals(this ._path, another._path)) {
221: return false;
222: }
223: // has_query
224: if (!equals(this ._query, another._query)) {
225: return false;
226: }
227: // UURIs do not have fragments
228: return true;
229: }
230:
231: /**
232: * Strips www variants from the host.
233: *
234: * Strips www[0-9]*\. from the host. If calling getHostBaseName becomes a
235: * performance issue we should consider adding the hostBasename member that
236: * is set on initialization.
237: *
238: * @return Host's basename.
239: * @throws URIException
240: */
241: public String getHostBasename() throws URIException {
242: // caching eliminated because this is rarely used
243: // (only benefits legacy DomainScope, which should
244: // be retired). Saves 4-byte object pointer in UURI
245: // instances.
246: return (this .getReferencedHost() == null) ? null : TextUtils
247: .replaceFirst(MASSAGEHOST_PATTERN, this
248: .getReferencedHost(), UURIFactory.EMPTY_STRING);
249: }
250:
251: /**
252: * Override to cache result
253: *
254: * @return String representation of this URI
255: */
256: public synchronized String toString() {
257: if (this .cachedString == null) {
258: this .cachedString = super .toString();
259: coalesceUriStrings();
260: }
261: return this .cachedString;
262: }
263:
264: public synchronized String getEscapedURI() {
265: if (this .cachedEscapedURI == null) {
266: this .cachedEscapedURI = super .getEscapedURI();
267: coalesceUriStrings();
268: }
269: return this .cachedEscapedURI;
270: }
271:
272: /**
273: * The two String fields cachedString and cachedEscapedURI are
274: * usually identical; if so, coalesce into a single instance.
275: */
276: protected void coalesceUriStrings() {
277: if (this .cachedString != null
278: && this .cachedEscapedURI != null
279: && this .cachedString.length() == this .cachedEscapedURI
280: .length()) {
281: // lengths will only be identical if contents are identical
282: // (deescaping will always shrink length), so coalesce to
283: // use only single cached instance
284: this .cachedString = this .cachedEscapedURI;
285: }
286: }
287:
288: public synchronized String getHost() throws URIException {
289: if (this .cachedHost == null) {
290: // If this._host is null, 3.0 httpclient throws
291: // illegalargumentexception. Don't go there.
292: if (this ._host != null) {
293: this .cachedHost = super .getHost();
294: coalesceHostAuthorityStrings();
295: }
296: }
297: return this .cachedHost;
298: }
299:
300: /**
301: * The two String fields cachedHost and cachedAuthorityMinusUserInfo are
302: * usually identical; if so, coalesce into a single instance.
303: */
304: protected void coalesceHostAuthorityStrings() {
305: if (this .cachedAuthorityMinusUserinfo != null
306: && this .cachedHost != null
307: && this .cachedHost.length() == this .cachedAuthorityMinusUserinfo
308: .length()) {
309: // lengths can only be identical if contents
310: // are identical; use only one instance
311: this .cachedAuthorityMinusUserinfo = this .cachedHost;
312: }
313: }
314:
315: /**
316: * Return the referenced host in the UURI, if any, also extracting the
317: * host of a DNS-lookup URI where necessary.
318: *
319: * @return the target or topic host of the URI
320: * @throws URIException
321: */
322: public String getReferencedHost() throws URIException {
323: String referencedHost = this .getHost();
324: if (referencedHost == null && this .getScheme().equals("dns")) {
325: // extract target domain of DNS lookup
326: String possibleHost = this .getCurrentHierPath();
327: if (possibleHost != null
328: && possibleHost.matches("[-_\\w\\.:]+")) {
329: referencedHost = possibleHost;
330: }
331: }
332: return referencedHost;
333: }
334:
335: /**
336: * @return Return the 'SURT' format of this UURI
337: */
338: public String getSurtForm() {
339: if (surtForm == null) {
340: surtForm = SURT.fromURI(this .toString());
341: }
342: return surtForm;
343: }
344:
345: /**
346: * Return the authority minus userinfo (if any).
347: *
348: * If no userinfo present, just returns the authority.
349: *
350: * @return The authority stripped of any userinfo if present.
351: * @throws URIException
352: */
353: public String getAuthorityMinusUserinfo() throws URIException {
354: if (this .cachedAuthorityMinusUserinfo == null) {
355: String tmp = getAuthority();
356: if (tmp != null && tmp.length() > 0) {
357: int index = tmp.indexOf('@');
358: if (index >= 0 && index < tmp.length()) {
359: tmp = tmp.substring(index + 1);
360: }
361: }
362: this .cachedAuthorityMinusUserinfo = tmp;
363: coalesceHostAuthorityStrings();
364: }
365: return this .cachedAuthorityMinusUserinfo;
366: }
367:
368: /* (non-Javadoc)
369: * @see java.lang.CharSequence#length()
370: */
371: public int length() {
372: return getEscapedURI().length();
373: }
374:
375: /* (non-Javadoc)
376: * @see java.lang.CharSequence#charAt(int)
377: */
378: public char charAt(int index) {
379: return getEscapedURI().charAt(index);
380: }
381:
382: /* (non-Javadoc)
383: * @see java.lang.CharSequence#subSequence(int, int)
384: */
385: public CharSequence subSequence(int start, int end) {
386: return getEscapedURI().subSequence(start, end);
387: }
388:
389: /* (non-Javadoc)
390: * @see java.lang.Comparable#compareTo(java.lang.Object)
391: */
392: public int compareTo(Object arg0) {
393: return getEscapedURI().compareTo(arg0.toString());
394: }
395:
396: /**
397: * Convenience method for finding the UURI inside an
398: * Object likely to have (or be/imply) one.
399: *
400: * @param o Object that is, has, or implies a UURI
401: * @return the UURI found, or null if none
402: */
403: public static UURI from(Object o) {
404: UURI u = null;
405: if (o instanceof UURI) {
406: u = (UURI) o;
407: } else if (o instanceof CandidateURI) {
408: u = ((CandidateURI) o).getUURI();
409: } else if (o instanceof CharSequence) {
410: String s = o.toString();
411: try {
412: u = UURIFactory.getInstance(s);
413: } catch (URIException e) {
414: LOGGER.log(Level.FINE, "bad URI", e);
415: }
416: }
417: return u;
418: }
419:
420: /**
421: * Test if passed String has likely URI scheme prefix.
422: * @param possibleUrl URL string to examine.
423: * @return True if passed string looks like it could be an URL.
424: */
425: public static boolean hasScheme(String possibleUrl) {
426: boolean result = false;
427: for (int i = 0; i < possibleUrl.length(); i++) {
428: char c = possibleUrl.charAt(i);
429: if (c == ':') {
430: if (i != 0) {
431: result = true;
432: }
433: break;
434: }
435: if (!scheme.get(c)) {
436: break;
437: }
438: }
439: return result;
440: }
441:
442: /**
443: * @param pathOrUri A file path or a URI.
444: * @return Path parsed from passed <code>pathOrUri</code>.
445: * @throws URISyntaxException
446: */
447: public static String parseFilename(final String pathOrUri)
448: throws URISyntaxException {
449: String path = pathOrUri;
450: if (UURI.hasScheme(pathOrUri)) {
451: URI url = new URI(pathOrUri);
452: path = url.getPath();
453: }
454: return (new File(path)).getName();
455: }
456: }
|