Source Code Cross Referenced for UURIFactory.java in  » Web-Crawler » heritrix » org » archive » net » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Sevlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.net 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* UURIFactory
002:         *
003:         * $Id: UURIFactory.java 5106 2007-05-01 00:07:29Z gojomo $
004:         *
005:         * Created on July 16, 2004
006:         *
007:         * Copyright (C) 2003 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.net;
026:
027:        import gnu.inet.encoding.IDNA;
028:        import gnu.inet.encoding.IDNAException;
029:        import it.unimi.dsi.mg4j.util.MutableString;
030:
031:        import java.io.UnsupportedEncodingException;
032:        import java.util.Arrays;
033:        import java.util.BitSet;
034:        import java.util.logging.Level;
035:        import java.util.logging.Logger;
036:        import java.util.regex.Matcher;
037:        import java.util.regex.Pattern;
038:
039:        import org.apache.commons.httpclient.URI;
040:        import org.apache.commons.httpclient.URIException;
041:        import org.archive.util.TextUtils;
042:
043:        /**
044:         * Factory that returns UURIs.
045:         * 
046:         * Does escaping and fixup on URIs, massaging them in accordance with RFC2396
047:         * and to match browser practice. For example, it removes any
048:         * '..' if first thing in the path as per IE,  converts backslashes to forward
049:         * slashes, and discards any 'fragment'/anchor portion of the URI. This
050:         * class will also fail URIs if they are longer than IE's allowed maximum
051:         * length.
052:         * 
053:         * <p>TODO: Test logging.
054:         * 
055:         * @author stack
056:         */
057:        public class UURIFactory extends URI {
058:
059:            private static final long serialVersionUID = -6146295130382209042L;
060:
061:            /**
062:             * Logging instance.
063:             */
064:            private static Logger logger = Logger.getLogger(UURIFactory.class
065:                    .getName());
066:
067:            /**
068:             * The single instance of this factory.
069:             */
070:            private static final UURIFactory factory = new UURIFactory();
071:
072:            /**
073:             * RFC 2396-inspired regex.
074:             *
075:             * From the RFC Appendix B:
076:             * <pre>
077:             * URI Generic Syntax                August 1998
078:             *
079:             * B. Parsing a URI Reference with a Regular Expression
080:             *
081:             * As described in Section 4.3, the generic URI syntax is not sufficient
082:             * to disambiguate the components of some forms of URI.  Since the
083:             * "greedy algorithm" described in that section is identical to the
084:             * disambiguation method used by POSIX regular expressions, it is
085:             * natural and commonplace to use a regular expression for parsing the
086:             * potential four components and fragment identifier of a URI reference.
087:             *
088:             * The following line is the regular expression for breaking-down a URI
089:             * reference into its components.
090:             *
091:             * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
092:             * 12            3  4          5       6  7        8 9
093:             *
094:             * The numbers in the second line above are only to assist readability;
095:             * they indicate the reference points for each subexpression (i.e., each
096:             * paired parenthesis).  We refer to the value matched for subexpression
097:             * <n> as $<n>.  For example, matching the above expression to
098:             *
099:             * http://www.ics.uci.edu/pub/ietf/uri/#Related
100:             *
101:             * results in the following subexpression matches:
102:             *
103:             * $1 = http:
104:             * $2 = http
105:             * $3 = //www.ics.uci.edu
106:             * $4 = www.ics.uci.edu
107:             * $5 = /pub/ietf/uri/
108:             * $6 = <undefined>
109:             * $7 = <undefined>
110:             * $8 = #Related
111:             * $9 = Related
112:             *
113:             * where <undefined> indicates that the component is not present, as is
114:             * the case for the query component in the above example.  Therefore, we
115:             * can determine the value of the four components and fragment as
116:             *
117:             * scheme    = $2
118:             * authority = $4
119:             * path      = $5
120:             * query     = $7
121:             * fragment  = $9
122:             * </pre>
123:             *
124:             * -- 
125:             * <p>Below differs from the rfc regex in that it has java escaping of
126:             * regex characters and we allow a URI made of a fragment only (Added extra
127:             * group so indexing is off by one after scheme).
128:             */
129:            final static Pattern RFC2396REGEX = Pattern
130:                    .compile("^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
131:            //    12            34  5          6       7   8          9 A
132:            //              2 1             54        6          87 3      A9
133:            // 1: scheme:
134:            // 2: scheme
135:            // 3: //authority/path?query
136:            // 4: //authority
137:            // 5: authority
138:            // 6: path
139:            // 7: ?query
140:            // 8: query 
141:            // 9: #fragment
142:            // A: fragment
143:
144:            public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+";
145:            public static final String SLASH = "/";
146:            public static final String HTTP = "http";
147:            public static final String HTTP_PORT = ":80";
148:            public static final String HTTPS = "https";
149:            public static final String HTTPS_PORT = ":443";
150:            public static final String DOT = ".";
151:            public static final String EMPTY_STRING = "";
152:            public static final String NBSP = "\u00A0";
153:            public static final String SPACE = " ";
154:            public static final String ESCAPED_SPACE = "%20";
155:            public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$";
156:            public static final String PIPE = "|";
157:            public static final String PIPE_PATTERN = "\\|";
158:            public static final String ESCAPED_PIPE = "%7C";
159:            public static final String CIRCUMFLEX = "^";
160:            public static final String CIRCUMFLEX_PATTERN = "\\^";
161:            public static final String ESCAPED_CIRCUMFLEX = "%5E";
162:            public static final String QUOT = "\"";
163:            public static final String ESCAPED_QUOT = "%22";
164:            public static final String SQUOT = "'";
165:            public static final String ESCAPED_SQUOT = "%27";
166:            public static final String APOSTROPH = "`";
167:            public static final String ESCAPED_APOSTROPH = "%60";
168:            public static final String LSQRBRACKET = "[";
169:            public static final String LSQRBRACKET_PATTERN = "\\[";
170:            public static final String ESCAPED_LSQRBRACKET = "%5B";
171:            public static final String RSQRBRACKET = "]";
172:            public static final String RSQRBRACKET_PATTERN = "\\]";
173:            public static final String ESCAPED_RSQRBRACKET = "%5D";
174:            public static final String LCURBRACKET = "{";
175:            public static final String LCURBRACKET_PATTERN = "\\{";
176:            public static final String ESCAPED_LCURBRACKET = "%7B";
177:            public static final String RCURBRACKET = "}";
178:            public static final String RCURBRACKET_PATTERN = "\\}";
179:            public static final String ESCAPED_RCURBRACKET = "%7D";
180:            public static final String BACKSLASH = "\\";
181:            public static final String BACKSLASH_PATTERN = "\\\\";
182:            public static final String ESCAPED_BACKSLASH = "%5C";
183:            public static final String STRAY_SPACING = "[\n\r\t]+";
184:            public static final String IMPROPERESC_REPLACE = "%25$1";
185:            public static final String IMPROPERESC = "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))";
186:            public static final String COMMERCIAL_AT = "@";
187:            public static final char PERCENT_SIGN = '%';
188:            public static final char COLON = ':';
189:
190:            /**
191:             * First percent sign in string followed by two hex chars.
192:             */
193:            public static final String URI_HEX_ENCODING = "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*";
194:
195:            /**
196:             * Authority port number regex.
197:             */
198:            final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$");
199:
200:            /**
201:             * Characters we'll accept in the domain label part of a URI
202:             * authority: ASCII letters-digits-hyphen (LDH) plus underscore,
203:             * with single intervening '.' characters.
204:             * 
205:             * (We accept '_' because DNS servers have tolerated it for many
206:             * years counter to spec; we also accept dash patterns and ACE
207:             * prefixes that will be rejected by IDN-punycoding attempt.)
208:             */
209:            final static String ACCEPTABLE_ASCII_DOMAIN = "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$";
210:
211:            /**
212:             * Pattern that looks for case of three or more slashes after the 
213:             * scheme.  If found, we replace them with two only as mozilla does.
214:             */
215:            final static Pattern HTTP_SCHEME_SLASHES = Pattern
216:                    .compile("^(https?://)/+(.*)");
217:
218:            /**
219:             * Pattern that looks for case of two or more slashes in a path.
220:             */
221:            final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+");
222:
223:            /**
224:             * System property key for list of supported schemes.
225:             */
226:            private static final String SCHEMES_KEY = ".schemes";
227:
228:            /**
229:             * System property key for list of purposefully-ignored schemes.
230:             */
231:            private static final String IGNORED_SCHEMES_KEY = ".ignored-schemes";
232:
233:            private String[] schemes = null;
234:            private String[] ignoredSchemes = null;
235:
236:            public static final int IGNORED_SCHEME = 9999999;
237:
238:            /**
239:             * Protected constructor.
240:             */
241:            private UURIFactory() {
242:                super ();
243:                String s = System.getProperty(this .getClass().getName()
244:                        + SCHEMES_KEY);
245:                if (s != null && s.length() > 0) {
246:                    schemes = s.split("[, ]+");
247:                    Arrays.sort(schemes);
248:                }
249:                String ignored = System.getProperty(this .getClass().getName()
250:                        + IGNORED_SCHEMES_KEY);
251:                if (ignored != null && ignored.length() > 0) {
252:                    ignoredSchemes = ignored.split("[, ]+");
253:                    Arrays.sort(ignoredSchemes);
254:                }
255:            }
256:
257:            /**
258:             * @param uri URI as string.
259:             * @return An instance of UURI
260:             * @throws URIException
261:             */
262:            public static UURI getInstance(String uri) throws URIException {
263:                return UURIFactory.factory.create(uri);
264:            }
265:
266:            /**
267:             * @param uri URI as string.
268:             * @param charset Character encoding of the passed uri string.
269:             * @return An instance of UURI
270:             * @throws URIException
271:             */
272:            public static UURI getInstance(String uri, String charset)
273:                    throws URIException {
274:                return UURIFactory.factory.create(uri, charset);
275:            }
276:
277:            /**
278:             * @param base Base uri to use resolving passed relative uri.
279:             * @param relative URI as string.
280:             * @return An instance of UURI
281:             * @throws URIException
282:             */
283:            public static UURI getInstance(UURI base, String relative)
284:                    throws URIException {
285:                return UURIFactory.factory.create(base, relative);
286:            }
287:
288:            /**
289:             * Test of whether passed String has an allowed URI scheme.
290:             * First tests if likely scheme suffix.  If so, we then test if its one of
291:             * the supported schemes.
292:             * @param possibleUrl URL string to examine.
293:             * @return True if passed string looks like it could be an URL.
294:             */
295:            public static boolean hasSupportedScheme(String possibleUrl) {
296:                boolean hasScheme = UURI.hasScheme(possibleUrl);
297:                if (!hasScheme || UURIFactory.factory.schemes == null) {
298:                    return hasScheme;
299:                }
300:                String tmpStr = possibleUrl.substring(0, possibleUrl
301:                        .indexOf(':'));
302:                return Arrays.binarySearch(UURIFactory.factory.schemes, tmpStr) >= 0;
303:            }
304:
305:            /**
306:             * @param uri URI as string.
307:             * @return Instance of UURI.
308:             * @throws URIException
309:             */
310:            private UURI create(String uri) throws URIException {
311:                return create(uri, UURI.getDefaultProtocolCharset());
312:            }
313:
314:            /**
315:             * @param uri URI as string.
316:             * @param charset Original encoding of the string.
317:             * @return Instance of UURI.
318:             * @throws URIException
319:             */
320:            private UURI create(String uri, String charset) throws URIException {
321:                UURI uuri = new UURI(fixup(uri, null, charset), true, charset);
322:                if (logger.isLoggable(Level.FINE)) {
323:                    logger.fine("URI " + uri + " PRODUCT " + uuri.toString()
324:                            + " CHARSET " + charset);
325:                }
326:                return validityCheck(uuri);
327:            }
328:
329:            /**
330:             * @param base UURI to use as a base resolving <code>relative</code>.
331:             * @param relative Relative URI.
332:             * @return Instance of UURI.
333:             * @throws URIException
334:             */
335:            private UURI create(UURI base, String relative) throws URIException {
336:                UURI uuri = new UURI(base,
337:                        new UURI(fixup(relative, base, base
338:                                .getProtocolCharset()), true, base
339:                                .getProtocolCharset()));
340:                if (logger.isLoggable(Level.FINE)) {
341:                    logger.fine(" URI " + relative + " PRODUCT "
342:                            + uuri.toString() + " CHARSET "
343:                            + base.getProtocolCharset() + " BASE " + base);
344:                }
345:                return validityCheck(uuri);
346:            }
347:
348:            /**
349:             * Check the generated UURI.
350:             * 
351:             * At the least look at length of uuri string.  We were seeing case
352:             * where before escaping, string was &lt; MAX_URL_LENGTH but after was
353:             * &gt;.  Letting out a too-big message was causing us troubles later
354:             * down the processing chain.
355:             * @param uuri Created uuri to check.
356:             * @return The passed <code>uuri</code> so can easily inline this check.
357:             * @throws URIException
358:             */
359:            protected UURI validityCheck(UURI uuri) throws URIException {
360:                if (uuri.getRawURI().length > UURI.MAX_URL_LENGTH) {
361:                    throw new URIException("Created (escaped) uuri > "
362:                            + UURI.MAX_URL_LENGTH + ": " + uuri.toString());
363:                }
364:                return uuri;
365:            }
366:
367:            /**
368:             * Do heritrix fix-up on passed uri string.
369:             *
370:             * Does heritrix escaping; usually escaping done to make our behavior align
371:             * with IEs.  This method codifies our experience pulling URIs from the
372:             * wilds.  Its does all the escaping we want; its output can always be
373:             * assumed to be 'escaped' (though perhaps to a laxer standard than the 
374:             * vanilla HttpClient URI class or official specs might suggest). 
375:             *
376:             * @param uri URI as string.
377:             * @param base May be null.
378:             * @param e True if the uri is already escaped.
379:             * @return A fixed up URI string.
380:             * @throws URIException
381:             */
382:            private String fixup(String uri, final URI base,
383:                    final String charset) throws URIException {
384:                if (uri == null) {
385:                    throw new NullPointerException();
386:                } else if (uri.length() == 0 && base == null) {
387:                    throw new URIException(
388:                            "URI length is zero (and not relative).");
389:                }
390:
391:                if (uri.length() > UURI.MAX_URL_LENGTH) {
392:                    // We check length here and again later after all convertions.
393:                    throw new URIException("URI length > "
394:                            + UURI.MAX_URL_LENGTH + ": " + uri);
395:                }
396:
397:                // Replace nbsp with normal spaces (so that they get stripped if at
398:                // ends, or encoded if in middle)
399:                if (uri.indexOf(NBSP) >= 0) {
400:                    uri = TextUtils.replaceAll(NBSP, uri, SPACE);
401:                }
402:
403:                // Get rid of any trailing spaces or new-lines. 
404:                uri = uri.trim();
405:
406:                // IE actually converts backslashes to slashes rather than to %5C.
407:                // Since URIs that have backslashes usually work only with IE, we will
408:                // convert backslashes to slashes as well.
409:                // TODO: Maybe we can first convert backslashes by specs and than by IE
410:                // so that we fetch both versions.
411:                if (uri.indexOf(BACKSLASH) >= 0) {
412:                    uri = TextUtils.replaceAll(BACKSLASH_PATTERN, uri, SLASH);
413:                }
414:
415:                // Remove stray TAB/CR/LF
416:                uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING);
417:
418:                // Test for the case of more than two slashes after the http(s) scheme.
419:                // Replace with two slashes as mozilla does if found.
420:                // See [ 788219 ] URI Syntax Errors stop page parsing.
421:                Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri);
422:                if (matcher.matches()) {
423:                    uri = matcher.group(1) + matcher.group(2);
424:                }
425:
426:                // now, minimally escape any whitespace
427:                uri = escapeWhitespace(uri);
428:
429:                // For further processing, get uri elements.  See the RFC2396REGEX
430:                // comment above for explaination of group indices used in the below.
431:                matcher = RFC2396REGEX.matcher(uri);
432:                if (!matcher.matches()) {
433:                    throw new URIException("Failed parse of " + uri);
434:                }
435:                String uriScheme = checkUriElementAndLowerCase(matcher.group(2));
436:                String uriSchemeSpecificPart = checkUriElement(matcher.group(3));
437:                String uriAuthority = checkUriElement(matcher.group(5));
438:                String uriPath = checkUriElement(matcher.group(6));
439:                String uriQuery = checkUriElement(matcher.group(8));
440:                // UNUSED String uriFragment = checkUriElement(matcher.group(10));
441:
442:                // If a scheme, is it a supported scheme?
443:                if (uriScheme != null && uriScheme.length() > 0
444:                        && this .schemes != null) {
445:                    if (!(Arrays.binarySearch(schemes, uriScheme) >= 0)) {
446:                        // unsupported; see if silently ignored
447:                        if ((Arrays.binarySearch(ignoredSchemes, uriScheme) >= 0)) {
448:                            throw new URIException(IGNORED_SCHEME,
449:                                    "Ignored scheme: " + uriScheme);
450:                        } else {
451:                            throw new URIException("Unsupported scheme: "
452:                                    + uriScheme);
453:                        }
454:                    }
455:                }
456:
457:                // Test if relative URI. If so, need a base to resolve against.
458:                if (uriScheme == null || uriScheme.length() <= 0) {
459:                    if (base == null) {
460:                        throw new URIException("Relative URI but no base: "
461:                                + uri);
462:                    }
463:                } else {
464:                    checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme,
465:                            uriSchemeSpecificPart);
466:                }
467:
468:                // fixup authority portion: lowercase/IDN-punycode any domain; 
469:                // remove stray trailing spaces
470:                uriAuthority = fixupAuthority(uriAuthority);
471:
472:                // Do some checks if absolute path.
473:                if (uriSchemeSpecificPart != null
474:                        && uriSchemeSpecificPart.startsWith(SLASH)) {
475:                    if (uriPath != null) {
476:                        // Eliminate '..' if its first thing in the path.  IE does this.
477:                        uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH,
478:                                uriPath, SLASH);
479:                    }
480:                    // Ensure root URLs end with '/': browsers always send "/"
481:                    // on the request-line, so we should consider "http://host"
482:                    // to be "http://host/".
483:                    if (uriPath == null || EMPTY_STRING.equals(uriPath)) {
484:                        uriPath = SLASH;
485:                    }
486:                }
487:
488:                if (uriAuthority != null) {
489:                    if (uriScheme != null && uriScheme.length() > 0
490:                            && uriScheme.equals(HTTP)) {
491:                        uriAuthority = checkPort(uriAuthority);
492:                        uriAuthority = stripTail(uriAuthority, HTTP_PORT);
493:                    } else if (uriScheme != null && uriScheme.length() > 0
494:                            && uriScheme.equals(HTTPS)) {
495:                        uriAuthority = checkPort(uriAuthority);
496:                        uriAuthority = stripTail(uriAuthority, HTTPS_PORT);
497:                    }
498:                    // Strip any prefix dot or tail dots from the authority.
499:                    uriAuthority = stripTail(uriAuthority, DOT);
500:                    uriAuthority = stripPrefix(uriAuthority, DOT);
501:                } else {
502:                    // no authority; may be relative. consider stripping scheme
503:                    // to work-around org.apache.commons.httpclient.URI bug
504:                    // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 )
505:                    if (uriScheme != null && base != null
506:                            && uriScheme.equals(base.getScheme())) {
507:                        // uriScheme redundant and will only confound httpclient.URI
508:                        uriScheme = null;
509:                    }
510:                }
511:
512:                // Ensure minimal escaping. Use of 'lax' URI and URLCodec 
513:                // means minimal escaping isn't necessarily complete/consistent.
514:                // There is a chance such lax encoding will throw exceptions
515:                // later at inconvenient times. 
516:                //
517:                // One reason for these bad escapings -- though not the only --
518:                // is that the page is using an encoding other than the ASCII or the
519:                // UTF-8 that is our default URI encoding.  In this case the parent
520:                // class is burping on the passed URL encoding.  If the page encoding
521:                // was passed into this factory, the encoding seems to be parsed
522:                // correctly (See the testEscapedEncoding unit test).
523:                //
524:                // This fixup may cause us to miss content.  There is the charset case
525:                // noted above.  TODO: Look out for cases where we fail other than for
526:                // the above given reason which will be fixed when we address
527:                // '[ 913687 ] Make extractors interrogate for charset'.
528:
529:                uriPath = ensureMinimalEscaping(uriPath, charset);
530:                uriQuery = ensureMinimalEscaping(uriQuery, charset,
531:                        LaxURLCodec.QUERY_SAFE);
532:
533:                // Preallocate.  The '1's and '2's in below are space for ':',
534:                // '//', etc. URI characters.
535:                MutableString s = new MutableString(
536:                        ((uriScheme != null) ? uriScheme.length() : 0)
537:                                + 1 // ';' 
538:                                + ((uriAuthority != null) ? uriAuthority
539:                                        .length() : 0)
540:                                + 2 // '//'
541:                                + ((uriPath != null) ? uriPath.length() : 0)
542:                                + 1 // '?'
543:                                + ((uriQuery != null) ? uriQuery.length() : 0));
544:                appendNonNull(s, uriScheme, ":", true);
545:                appendNonNull(s, uriAuthority, "//", false);
546:                appendNonNull(s, uriPath, "", false);
547:                appendNonNull(s, uriQuery, "?", false);
548:                return s.toString();
549:            }
550:
551:            /**
552:             * If http(s) scheme, check scheme specific part begins '//'.
553:             * @throws URIException 
554:             * @see http://www.faqs.org/rfcs/rfc1738.html Section 3.1. Common Internet
555:             * Scheme Syntax
556:             */
557:            protected void checkHttpSchemeSpecificPartSlashPrefix(
558:                    final URI base, final String scheme,
559:                    final String schemeSpecificPart) throws URIException {
560:                // Only apply this check if no base.
561:                if (base != null) {
562:                    return;
563:                }
564:                if (scheme == null || scheme.length() <= 0) {
565:                    return;
566:                }
567:                if (!scheme.equals("http") && !scheme.equals("https")) {
568:                    return;
569:                }
570:                if (!schemeSpecificPart.startsWith("//")) {
571:                    throw new URIException("http scheme specific part must "
572:                            + "begin '//': " + schemeSpecificPart);
573:                }
574:                if (schemeSpecificPart.length() <= 2) {
575:                    throw new URIException("http scheme specific part is "
576:                            + "too short: " + schemeSpecificPart);
577:                }
578:            }
579:
580:            /**
581:             * Fixup 'authority' portion of URI, by removing any stray 
582:             * encoded spaces, lowercasing any domain names, and applying
583:             * IDN-punycoding to Unicode domains. 
584:             * 
585:             * @param uriAuthority the authority string to fix
586:             * @return fixed version
587:             * @throws URIException
588:             */
589:            private String fixupAuthority(String uriAuthority)
590:                    throws URIException {
591:                // Lowercase the host part of the uriAuthority; don't destroy any
592:                // userinfo capitalizations.  Make sure no illegal characters in
593:                // domainlabel substring of the uri authority.
594:                if (uriAuthority != null) {
595:                    // Get rid of any trailing escaped spaces:
596:                    // http://www.archive.org%20.  Rare but happens.
597:                    // TODO: reevaluate: do IE or firefox do such mid-URI space-removal?
598:                    // if not, we shouldn't either. 
599:                    while (uriAuthority.endsWith(ESCAPED_SPACE)) {
600:                        uriAuthority = uriAuthority.substring(0, uriAuthority
601:                                .length() - 3);
602:                    }
603:
604:                    // lowercase & IDN-punycode only the domain portion
605:                    int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
606:                    int portColonIndex = uriAuthority.indexOf(COLON,
607:                            (atIndex < 0) ? 0 : atIndex);
608:                    if (atIndex < 0 && portColonIndex < 0) {
609:                        // most common case: neither userinfo nor port
610:                        return fixupDomainlabel(uriAuthority);
611:                    } else if (atIndex < 0 && portColonIndex > -1) {
612:                        // next most common: port but no userinfo
613:                        String domain = fixupDomainlabel(uriAuthority
614:                                .substring(0, portColonIndex));
615:                        String port = uriAuthority.substring(portColonIndex);
616:                        return domain + port;
617:                    } else if (atIndex > -1 && portColonIndex < 0) {
618:                        // uncommon: userinfo, no port
619:                        String userinfo = uriAuthority
620:                                .substring(0, atIndex + 1);
621:                        String domain = fixupDomainlabel(uriAuthority
622:                                .substring(atIndex + 1));
623:                        return userinfo + domain;
624:                    } else {
625:                        // uncommon: userinfo, port
626:                        String userinfo = uriAuthority
627:                                .substring(0, atIndex + 1);
628:                        String domain = fixupDomainlabel(uriAuthority
629:                                .substring(atIndex + 1, portColonIndex));
630:                        String port = uriAuthority.substring(portColonIndex);
631:                        return userinfo + domain + port;
632:                    }
633:                }
634:                return uriAuthority;
635:            }
636:
637:            /**
638:             * Fixup the domain label part of the authority.
639:             * 
640:             * We're more lax than the spec. in that we allow underscores.
641:             * 
642:             * @param label Domain label to fix.
643:             * @return Return fixed domain label.
644:             * @throws URIException
645:             */
646:            private String fixupDomainlabel(String label) throws URIException {
647:
648:                // apply IDN-punycoding, as necessary
649:                try {
650:                    // TODO: optimize: only apply when necessary, or
651:                    // keep cache of recent encodings
652:                    label = IDNA.toASCII(label);
653:                } catch (IDNAException e) {
654:                    if (TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN, label)) {
655:                        // domain name has ACE prefix, leading/trailing dash, or 
656:                        // underscore -- but is still a name we wish to tolerate;
657:                        // simply continue
658:                    } else {
659:                        // problematic domain: neither ASCII acceptable characters
660:                        // nor IDN-punycodable, so throw exception 
661:                        // TODO: change to HeritrixURIException so distinguishable
662:                        // from URIExceptions in library code
663:                        URIException ue = new URIException(e + " " + label);
664:                        ue.initCause(e);
665:                        throw ue;
666:                    }
667:                }
668:                label = label.toLowerCase();
669:                return label;
670:            }
671:
672:            /**
673:             * Ensure that there all characters needing escaping
674:             * in the passed-in String are escaped. Stray '%' characters
675:             * are *not* escaped, as per browser behavior. 
676:             * 
677:             * @param u String to escape
678:             * @param charset 
679:             * @return string with any necessary escaping applied
680:             */
681:            private String ensureMinimalEscaping(String u, final String charset) {
682:                return ensureMinimalEscaping(u, charset,
683:                        LaxURLCodec.EXPANDED_URI_SAFE);
684:            }
685:
686:            /**
687:             * Ensure that there all characters needing escaping
688:             * in the passed-in String are escaped. Stray '%' characters
689:             * are *not* escaped, as per browser behavior. 
690:             * 
691:             * @param u String to escape
692:             * @param charset 
693:             * @param bitset 
694:             * @return string with any necessary escaping applied
695:             */
696:            private String ensureMinimalEscaping(String u,
697:                    final String charset, final BitSet bitset) {
698:                if (u == null) {
699:                    return null;
700:                }
701:                for (int i = 0; i < u.length(); i++) {
702:                    char c = u.charAt(i);
703:                    if (!bitset.get(c)) {
704:                        try {
705:                            u = LaxURLCodec.DEFAULT.encode(bitset, u, charset);
706:                        } catch (UnsupportedEncodingException e) {
707:                            e.printStackTrace();
708:                        }
709:                        break;
710:                    }
711:                }
712:                return u;
713:            }
714:
715:            /**
716:             * Escape any whitespace found.
717:             * 
718:             * The parent class takes care of the bulk of escaping.  But if any
719:             * instance of escaping is found in the URI, then we ask for parent
720:             * to do NO escaping.  Here we escape any whitespace found irrespective
721:             * of whether the uri has already been escaped.  We do this for
722:             * case where uri has been judged already-escaped only, its been
723:             * incompletly done and whitespace remains.  Spaces, etc., in the URI are
724:             * a real pain.  Their presence will break log file and ARC parsing.
725:             * @param uri URI string to check.
726:             * @return uri with spaces escaped if any found.
727:             */
728:            protected String escapeWhitespace(String uri) {
729:                // Just write a new string anyways.  The perl '\s' is not
730:                // as inclusive as the Character.isWhitespace so there are
731:                // whitespace characters we could miss.  So, rather than
732:                // write some awkward regex, just go through the string
733:                // a character at a time.  Only create buffer first time
734:                // we find a space.
735:                MutableString buffer = null;
736:                for (int i = 0; i < uri.length(); i++) {
737:                    char c = uri.charAt(i);
738:                    if (Character.isWhitespace(c)) {
739:                        if (buffer == null) {
740:                            buffer = new MutableString(uri.length() + 2 /*If space, two extra characters (at least)*/);
741:                            buffer.append(uri.substring(0, i));
742:                        }
743:                        buffer.append("%");
744:                        String hexStr = Integer.toHexString(c);
745:                        if ((hexStr.length() % 2) > 0) {
746:                            buffer.append("0");
747:                        }
748:                        buffer.append(hexStr);
749:
750:                    } else {
751:                        if (buffer != null) {
752:                            buffer.append(c);
753:                        }
754:                    }
755:                }
756:                return (buffer != null) ? buffer.toString() : uri;
757:            }
758:
759:            /**
760:             * Check port on passed http authority.  Make sure the size is not larger
761:             * than allowed: See the 'port' definition on this
762:             * page, http://www.kerio.com/manual/wrp/en/418.htm.
763:             * Also, we've seen port numbers of '0080' whose leading zeros confuse
764:             * the parent class. Strip the leading zeros.
765:             *
766:             * @param uriAuthority
767:             * @return Null or an amended port number.
768:             * @throws URIException
769:             */
770:            private String checkPort(String uriAuthority) throws URIException {
771:                Matcher m = PORTREGEX.matcher(uriAuthority);
772:                if (m.matches()) {
773:                    String no = m.group(2);
774:                    if (no != null && no.length() > 0) {
775:                        // First check if the port has leading zeros
776:                        // as in '0080'.  Strip them if it has and
777:                        // then reconstitute the uriAuthority.  Be careful
778:                        // of cases where port is '0' or '000'.
779:                        while (no.charAt(0) == '0' && no.length() > 1) {
780:                            no = no.substring(1);
781:                        }
782:                        uriAuthority = m.group(1) + no;
783:                        // Now makesure the number is legit.
784:                        int portNo = Integer.parseInt(no);
785:                        if (portNo <= 0 || portNo > 65535) {
786:                            throw new URIException("Port out of bounds: "
787:                                    + uriAuthority);
788:                        }
789:                    }
790:                }
791:                return uriAuthority;
792:            }
793:
794:            /**
795:             * @param b Buffer to append to.
796:             * @param str String to append if not null.
797:             * @param substr Suffix or prefix to use if <code>str</code> is not null.
798:             * @param suffix True if <code>substr</code> is a suffix.
799:             */
800:            private void appendNonNull(MutableString b, String str,
801:                    String substr, boolean suffix) {
802:                if (str != null && str.length() > 0) {
803:                    if (!suffix) {
804:                        b.append(substr);
805:                    }
806:                    b.append(str);
807:                    if (suffix) {
808:                        b.append(substr);
809:                    }
810:                }
811:            }
812:
813:            /**
814:             * @param str String to work on.
815:             * @param prefix Prefix to strip if present.
816:             * @return <code>str</code> w/o <code>prefix</code>.
817:             */
818:            private String stripPrefix(String str, String prefix) {
819:                return str.startsWith(prefix) ? str.substring(prefix.length(),
820:                        str.length()) : str;
821:            }
822:
823:            /**
824:             * @param str String to work on.
825:             * @param tail Tail to strip if present.
826:             * @return <code>str</code> w/o <code>tail</code>.
827:             */
828:            private static String stripTail(String str, String tail) {
829:                return str.endsWith(tail) ? str.substring(0, str.length()
830:                        - tail.length()) : str;
831:            }
832:
833:            /**
834:             * @param element to examine.
835:             * @return Null if passed null or an empty string otherwise
836:             * <code>element</code>.
837:             */
838:            private String checkUriElement(String element) {
839:                return (element == null || element.length() <= 0) ? null
840:                        : element;
841:            }
842:
843:            /**
844:             * @param element to examine and lowercase if non-null.
845:             * @return Null if passed null or an empty string otherwise
846:             * <code>element</code> lowercased.
847:             */
848:            private String checkUriElementAndLowerCase(String element) {
849:                String tmp = checkUriElement(element);
850:                return (tmp != null) ? tmp.toLowerCase() : tmp;
851:            }
852:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.