Source Code Cross Referenced for UURI.java in  » Web-Crawler » heritrix » org » archive » net » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.net 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* UURI
002:         *
003:         * $Id: UURI.java 4646 2006-09-22 17:23:04Z paul_jack $
004:         *
005:         * Created on Apr 18, 2003
006:         *
007:         * Copyright (C) 2003 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.net;
026:
027:        import java.io.File;
028:        import java.io.Serializable;
029:        import java.net.URI;
030:        import java.net.URISyntaxException;
031:        import java.util.logging.Level;
032:        import java.util.logging.Logger;
033:
034:        import org.apache.commons.httpclient.URIException;
035:        import org.archive.crawler.datamodel.CandidateURI;
036:        import org.archive.util.SURT;
037:        import org.archive.util.TextUtils;
038:
039:        /**
040:         * Usable URI.
041:         * 
042:         * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
043:         * and methods. It cannot be instantiated directly.  Go via UURIFactory.
044:         * 
045:         *  <p>We used to use {@link java.net.URI} for parsing URIs but ran across
046:         * quirky behaviors and bugs.  {@link java.net.URI} is not subclassable --
047:         * it's final -- and it's unlikely that java.net.URI will change any time soon
048:         * (See Gordon's considered petition here:
049:         * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
050:         * should have loose/tolerant/compatibility option (or allow reuse)</a>).
051:         *
052:         * <p>This class tries to cache calculated strings such as the extracted host
053:         * and this class as a string rather than have the parent class rerun its
054:         * calculation every time.
055:         *
056:         * @author gojomo
057:         * @author stack
058:         *
059:         * @see org.apache.commons.httpclient.URI
060:         */
061:        public class UURI extends LaxURI implements  CharSequence, Serializable {
062:
063:            private static final long serialVersionUID = -1277570889914647093L;
064:
065:            private static Logger LOGGER = Logger.getLogger(UURI.class
066:                    .getName());
067:
068:            /**
069:             * Consider URIs too long for IE as illegal.
070:             */
071:            public final static int MAX_URL_LENGTH = 2083;
072:
073:            public static final String MASSAGEHOST_PATTERN = "^www\\d*\\.";
074:
075:            /**
076:             * Cache of the host name.
077:             *
078:             * Super class calculates on every call.  Profiling shows us spend 30% of
079:             * total elapsed time in URI class.
080:             */
081:            private transient String cachedHost = null;
082:
083:            /**
084:             * Cache of this uuri escaped as a string.
085:             *
086:             * Super class calculates on every call.  Profiling shows us spend 30% of
087:             * total elapsed time in URI class.
088:             */
089:            private transient String cachedEscapedURI = null;
090:
091:            /**
092:             * Cache of this uuri escaped as a string.
093:             *
094:             * Super class calculates on every call.  Profiling shows us spend 30% of
095:             * total elapsed time in URI class.
096:             */
097:            private transient String cachedString = null;
098:
099:            /**
100:             * Cached authority minus userinfo.
101:             */
102:            private transient String cachedAuthorityMinusUserinfo = null;
103:
104:            /**
105:             * Cache of this uuri in SURT format
106:             */
107:            private transient String surtForm = null;
108:
109:            // Technically, underscores are disallowed in the domainlabel
110:            // portion of hostname according to rfc2396 but we'll be more
111:            // loose and allow them. See: [ 1072035 ] [uuri] Underscore in
112:            // host messes up port parsing.
113:            static {
114:                hostname.set('_');
115:            }
116:
117:            /**
118:             * Shutdown access to default constructor.
119:             */
120:            protected UURI() {
121:                super ();
122:            }
123:
124:            /**
125:             * @param uri String representation of an absolute URI.
126:             * @param escaped If escaped.
127:             * @param charset Charset to use.
128:             * @throws org.apache.commons.httpclient.URIException
129:             */
130:            protected UURI(String uri, boolean escaped, String charset)
131:                    throws URIException {
132:                super (uri, escaped, charset);
133:                normalize();
134:            }
135:
136:            /**
137:             * @param relative String representation of URI.
138:             * @param base Parent UURI to use derelativizing.
139:             * @throws org.apache.commons.httpclient.URIException
140:             */
141:            protected UURI(UURI base, UURI relative) throws URIException {
142:                super (base, relative);
143:                normalize();
144:            }
145:
146:            /**
147:             * @param uri String representation of a URI.
148:             * @param escaped If escaped.
149:             * @throws NullPointerException
150:             * @throws URIException
151:             */
152:            public UURI(String uri, boolean escaped) throws URIException,
153:                    NullPointerException {
154:                super (uri, escaped);
155:                normalize();
156:            }
157:
158:            /**
159:             * @param uri URI as string that is resolved relative to this UURI.
160:             * @return UURI that uses this UURI as base.
161:             * @throws URIException
162:             */
163:            public UURI resolve(String uri) throws URIException {
164:                return resolve(uri, false, // assume not escaped
165:                        this .getProtocolCharset());
166:            }
167:
168:            /**
169:             * @param uri URI as string that is resolved relative to this UURI.
170:             * @param e True if escaped.
171:             * @return UURI that uses this UURI as base.
172:             * @throws URIException
173:             */
174:            public UURI resolve(String uri, boolean e) throws URIException {
175:                return resolve(uri, e, this .getProtocolCharset());
176:            }
177:
178:            /**
179:             * @param uri URI as string that is resolved relative to this UURI.
180:             * @param e True if uri is escaped.
181:             * @param charset Charset to use.
182:             * @return UURI that uses this UURI as base.
183:             * @throws URIException
184:             */
185:            public UURI resolve(String uri, boolean e, String charset)
186:                    throws URIException {
187:                return new UURI(this , new UURI(uri, e, charset));
188:            }
189:
190:            /**
191:             * Test an object if this UURI is equal to another.
192:             *
193:             * @param obj an object to compare
194:             * @return true if two URI objects are equal
195:             */
196:            public boolean equals(Object obj) {
197:
198:                // normalize and test each components
199:                if (obj == this ) {
200:                    return true;
201:                }
202:                if (!(obj instanceof  UURI)) {
203:                    return false;
204:                }
205:                UURI another = (UURI) obj;
206:                // scheme
207:                if (!equals(this ._scheme, another._scheme)) {
208:                    return false;
209:                }
210:                // is_opaque_part or is_hier_part?  and opaque
211:                if (!equals(this ._opaque, another._opaque)) {
212:                    return false;
213:                }
214:                // is_hier_part
215:                // has_authority
216:                if (!equals(this ._authority, another._authority)) {
217:                    return false;
218:                }
219:                // path
220:                if (!equals(this ._path, another._path)) {
221:                    return false;
222:                }
223:                // has_query
224:                if (!equals(this ._query, another._query)) {
225:                    return false;
226:                }
227:                // UURIs do not have fragments
228:                return true;
229:            }
230:
231:            /**
232:             * Strips www variants from the host.
233:             *
234:             * Strips www[0-9]*\. from the host.  If calling getHostBaseName becomes a
235:             * performance issue we should consider adding the hostBasename member that
236:             * is set on initialization.
237:             *
238:             * @return Host's basename.
239:             * @throws URIException
240:             */
241:            public String getHostBasename() throws URIException {
242:                // caching eliminated because this is rarely used
243:                // (only benefits legacy DomainScope, which should
244:                // be retired). Saves 4-byte object pointer in UURI
245:                // instances.
246:                return (this .getReferencedHost() == null) ? null : TextUtils
247:                        .replaceFirst(MASSAGEHOST_PATTERN, this 
248:                                .getReferencedHost(), UURIFactory.EMPTY_STRING);
249:            }
250:
251:            /**
252:             * Override to cache result
253:             * 
254:             * @return String representation of this URI
255:             */
256:            public synchronized String toString() {
257:                if (this .cachedString == null) {
258:                    this .cachedString = super .toString();
259:                    coalesceUriStrings();
260:                }
261:                return this .cachedString;
262:            }
263:
264:            public synchronized String getEscapedURI() {
265:                if (this .cachedEscapedURI == null) {
266:                    this .cachedEscapedURI = super .getEscapedURI();
267:                    coalesceUriStrings();
268:                }
269:                return this .cachedEscapedURI;
270:            }
271:
272:            /**
273:             * The two String fields cachedString and cachedEscapedURI are 
274:             * usually identical; if so, coalesce into a single instance. 
275:             */
276:            protected void coalesceUriStrings() {
277:                if (this .cachedString != null
278:                        && this .cachedEscapedURI != null
279:                        && this .cachedString.length() == this .cachedEscapedURI
280:                                .length()) {
281:                    // lengths will only be identical if contents are identical
282:                    // (deescaping will always shrink length), so coalesce to
283:                    // use only single cached instance
284:                    this .cachedString = this .cachedEscapedURI;
285:                }
286:            }
287:
288:            public synchronized String getHost() throws URIException {
289:                if (this .cachedHost == null) {
290:                    // If this._host is null, 3.0 httpclient throws
291:                    // illegalargumentexception.  Don't go there.
292:                    if (this ._host != null) {
293:                        this .cachedHost = super .getHost();
294:                        coalesceHostAuthorityStrings();
295:                    }
296:                }
297:                return this .cachedHost;
298:            }
299:
300:            /**
301:             * The two String fields cachedHost and cachedAuthorityMinusUserInfo are 
302:             * usually identical; if so, coalesce into a single instance. 
303:             */
304:            protected void coalesceHostAuthorityStrings() {
305:                if (this .cachedAuthorityMinusUserinfo != null
306:                        && this .cachedHost != null
307:                        && this .cachedHost.length() == this .cachedAuthorityMinusUserinfo
308:                                .length()) {
309:                    // lengths can only be identical if contents
310:                    // are identical; use only one instance
311:                    this .cachedAuthorityMinusUserinfo = this .cachedHost;
312:                }
313:            }
314:
315:            /**
316:             * Return the referenced host in the UURI, if any, also extracting the 
317:             * host of a DNS-lookup URI where necessary. 
318:             * 
319:             * @return the target or topic host of the URI
320:             * @throws URIException
321:             */
322:            public String getReferencedHost() throws URIException {
323:                String referencedHost = this .getHost();
324:                if (referencedHost == null && this .getScheme().equals("dns")) {
325:                    // extract target domain of DNS lookup
326:                    String possibleHost = this .getCurrentHierPath();
327:                    if (possibleHost != null
328:                            && possibleHost.matches("[-_\\w\\.:]+")) {
329:                        referencedHost = possibleHost;
330:                    }
331:                }
332:                return referencedHost;
333:            }
334:
335:            /**
336:             * @return Return the 'SURT' format of this UURI
337:             */
338:            public String getSurtForm() {
339:                if (surtForm == null) {
340:                    surtForm = SURT.fromURI(this .toString());
341:                }
342:                return surtForm;
343:            }
344:
345:            /**
346:             * Return the authority minus userinfo (if any).
347:             * 
348:             * If no userinfo present, just returns the authority.
349:             * 
350:             * @return The authority stripped of any userinfo if present.
351:             * @throws URIException
352:             */
353:            public String getAuthorityMinusUserinfo() throws URIException {
354:                if (this .cachedAuthorityMinusUserinfo == null) {
355:                    String tmp = getAuthority();
356:                    if (tmp != null && tmp.length() > 0) {
357:                        int index = tmp.indexOf('@');
358:                        if (index >= 0 && index < tmp.length()) {
359:                            tmp = tmp.substring(index + 1);
360:                        }
361:                    }
362:                    this .cachedAuthorityMinusUserinfo = tmp;
363:                    coalesceHostAuthorityStrings();
364:                }
365:                return this .cachedAuthorityMinusUserinfo;
366:            }
367:
368:            /* (non-Javadoc)
369:             * @see java.lang.CharSequence#length()
370:             */
371:            public int length() {
372:                return getEscapedURI().length();
373:            }
374:
375:            /* (non-Javadoc)
376:             * @see java.lang.CharSequence#charAt(int)
377:             */
378:            public char charAt(int index) {
379:                return getEscapedURI().charAt(index);
380:            }
381:
382:            /* (non-Javadoc)
383:             * @see java.lang.CharSequence#subSequence(int, int)
384:             */
385:            public CharSequence subSequence(int start, int end) {
386:                return getEscapedURI().subSequence(start, end);
387:            }
388:
389:            /* (non-Javadoc)
390:             * @see java.lang.Comparable#compareTo(java.lang.Object)
391:             */
392:            public int compareTo(Object arg0) {
393:                return getEscapedURI().compareTo(arg0.toString());
394:            }
395:
396:            /**
397:             * Convenience method for finding the UURI inside an
398:             * Object likely to have (or be/imply) one.
399:             * 
400:             * @param o Object that is, has, or implies a UURI
401:             * @return the UURI found, or null if none
402:             */
403:            public static UURI from(Object o) {
404:                UURI u = null;
405:                if (o instanceof  UURI) {
406:                    u = (UURI) o;
407:                } else if (o instanceof  CandidateURI) {
408:                    u = ((CandidateURI) o).getUURI();
409:                } else if (o instanceof  CharSequence) {
410:                    String s = o.toString();
411:                    try {
412:                        u = UURIFactory.getInstance(s);
413:                    } catch (URIException e) {
414:                        LOGGER.log(Level.FINE, "bad URI", e);
415:                    }
416:                }
417:                return u;
418:            }
419:
420:            /**
421:             * Test if passed String has likely URI scheme prefix.
422:             * @param possibleUrl URL string to examine.
423:             * @return True if passed string looks like it could be an URL.
424:             */
425:            public static boolean hasScheme(String possibleUrl) {
426:                boolean result = false;
427:                for (int i = 0; i < possibleUrl.length(); i++) {
428:                    char c = possibleUrl.charAt(i);
429:                    if (c == ':') {
430:                        if (i != 0) {
431:                            result = true;
432:                        }
433:                        break;
434:                    }
435:                    if (!scheme.get(c)) {
436:                        break;
437:                    }
438:                }
439:                return result;
440:            }
441:
442:            /**
443:             * @param pathOrUri A file path or a URI.
444:             * @return Path parsed from passed <code>pathOrUri</code>.
445:             * @throws URISyntaxException
446:             */
447:            public static String parseFilename(final String pathOrUri)
448:                    throws URISyntaxException {
449:                String path = pathOrUri;
450:                if (UURI.hasScheme(pathOrUri)) {
451:                    URI url = new URI(pathOrUri);
452:                    path = url.getPath();
453:                }
454:                return (new File(path)).getName();
455:            }
456:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.