Source Code Cross Referenced for UURIFactoryTest.java in  » Web-Crawler » heritrix » org » archive » net » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.net 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* UURIFactoryTest
002:         *
003:         * $Id: UURIFactoryTest.java 5106 2007-05-01 00:07:29Z gojomo $
004:         *
005:         * Created on Apr 2, 2004
006:         *
007:         * Copyright (C) 2004 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:
026:        package org.archive.net;
027:
028:        import java.util.Iterator;
029:        import java.util.TreeMap;
030:
031:        import junit.framework.TestCase;
032:
033:        import org.apache.commons.httpclient.URIException;
034:
035:        /**
036:         * Test UURIFactory for proper UURI creation across variety of
037:         * important/tricky cases.
038:         * 
039:         * Be careful writing this file.  Make sure you write it with UTF-8 encoding.
040:         *
041:         * @author igor stack gojomo
042:         */
043:        public class UURIFactoryTest extends TestCase {
044:
045:            /**
046:             * Feeds the factory a URI holding dot segments, NBSP, space, quotes,
047:             * brackets and a backslash, and checks the string form against one
048:             * assembled from the factory's own ESCAPED_* constants.
049:             */
050:            public final void testEscaping() throws URIException {
051:                // NBSP and SPACE follow the dot segments and also end the input; the trailing pair should be trimmed.
052:                final String raw = "http://archive.org/.././\u00A0"
053:                        + UURIFactory.SPACE + UURIFactory.CIRCUMFLEX
054:                        + UURIFactory.QUOT + UURIFactory.SQUOT
055:                        + UURIFactory.APOSTROPH + UURIFactory.LSQRBRACKET
056:                        + UURIFactory.RSQRBRACKET + UURIFactory.LCURBRACKET
057:                        + UURIFactory.RCURBRACKET + UURIFactory.BACKSLASH
058:                        + "test/../a.gif\u00A0" + UURIFactory.SPACE;
059:                // Note: single quote is not being escaped by URI class, so the
060:                // expectation carries the plain SQUOT constant.
061:                final String wanted = "http://archive.org/"
062:                        + UURIFactory.ESCAPED_SPACE + UURIFactory.ESCAPED_SPACE
063:                        + UURIFactory.ESCAPED_CIRCUMFLEX + UURIFactory.ESCAPED_QUOT
064:                        + UURIFactory.SQUOT + UURIFactory.ESCAPED_APOSTROPH
065:                        + UURIFactory.ESCAPED_LSQRBRACKET + UURIFactory.ESCAPED_RSQRBRACKET
066:                        + UURIFactory.ESCAPED_LCURBRACKET + UURIFactory.ESCAPED_RCURBRACKET
067:                        + UURIFactory.SLASH + "a.gif";
068:                assertEquals("expected escaping", wanted, UURIFactory.getInstance(raw).toString());
069:            }
070:
071:            /** A host label with an underscore must still let the port (8080) be read back. */
072:            public final void testUnderscoreMakesPortParseFail() throws URIException {
073:                final UURI parsed = UURIFactory
074:                        .getInstance("http://one-two_three:8080/index.html");
075:                // assertEquals reports the port actually found when the check fails.
076:                assertEquals("Failed find of port " + parsed, 8080, parsed.getPort());
077:            }
078:
079:            /** Resolving "one//index.html" against a base must keep the double slash. */
080:            public final void testRelativeURIWithTwoSlashes() throws URIException {
081:                final UURI base = UURIFactory.getInstance("http://www.archive.org");
082:                final UURI resolved = UURIFactory.getInstance(base, "one//index.html");
083:                // assertEquals shows the expected and the actual string on failure.
084:                assertEquals("Doesn't do right thing with two slashes " + resolved,
085:                        "http://www.archive.org/one//index.html", resolved.toString());
086:            }
087:
088:            /** One or several "%20" sequences trailing the host must be stripped by the factory. */
089:            public final void testTrailingEncodedSpace() throws URIException {
090:                final String wanted = "http://www.nps-shoes.co.uk/";
091:                UURI trimmed = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20");
092:                assertTrue("Doesn't strip trailing encoded space 1 " + trimmed,
093:                        trimmed.toString().equals(wanted));
094:                trimmed = UURIFactory.getInstance("http://www.nps-shoes.co.uk%20%20%20");
095:                assertTrue("Doesn't strip trailing encoded space 2 " + trimmed,
096:                        trimmed.toString().equals(wanted));
097:            }
098:
099:            /** Port written as "0080": the expected string form carries no port at all. */
100:            public final void testPort0080is80() throws URIException {
101:                final String shown = UURIFactory.getInstance("http://archive.org:0080").toString();
102:                assertTrue("Doesn't strip leading zeros " + shown, shown.equals("http://archive.org/"));
103:            }
104:
105:            // DISABLING TEST AS PRECURSOR TO ELIMINATION
106:            // the problematic input given -- specifically the "%6s" incomplete uri-escape,
107:            // shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least, 
108:            // will attempt to fetch such a URL (getting, in this case against that ad
109:            // server, a bad-request error). Ideally, we'd generate exactly the same 
110:            // request against the server as they do. However, with the most recent 
111:            // fixup for stray '%' signs, we come close, but not exactly. That's enough
112:            // to cause this test to fail (it's not getting the expected exception) but
113:            // our almost-URI, which might be what was intended, is better than trying 
114:            // nothing.
115:            //    public final void testBadPath() {
116:            //        String message = null;
117:            //        try {
118:            //            UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +
119:            //                "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +
120:            //                "generic&Params.richmedia=yes%26city%3Dseattle%26" +
121:            //                "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +
122:            //                "%6state%3DWA");
123:            //        } catch (URIException e) {
124:            //            message = e.getMessage();
125:            //        }
126:            //        assertNotNull("Didn't get expected exception.", message);
127:            //    }   
128:
129:            /** Passes when building the UURI with charset windows-1256 and calling getPath() throws nothing. */
130:            public final void testEscapeEncoding() throws URIException {
131:                final String escaped = "http://www.y1y1.com/"
132:                        + "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg";
133:                UURIFactory.getInstance(escaped, "windows-1256").getPath();
134:            }
135:
136:            /**
137:             * Pads a path with 1024 spaces and expects a URIException whose message
138:             * starts with "Created (escaped) uuri >".
139:             */
140:            public final void testTooLongAfterEscaping() {
141:                final StringBuffer padded = new StringBuffer("http://www.archive.org/a/");
142:                // Append bunch of spaces.  When escaped, they'll triple in size.
143:                for (int remaining = 1024; remaining > 0; remaining--) {
144:                    padded.append(" ");
145:                }
146:                String message = null;
147:                try {
148:                    UURIFactory.getInstance(padded.append("/index.html").toString());
149:                } catch (URIException e) {
150:                    message = e.getMessage();
151:                }
152:                final boolean prefixed = message != null && message.startsWith("Created (escaped) uuri >");
153:                assertTrue("Wrong or no exception: " + message, prefixed);
154:            }
155:
156:            /** An ftp URI with userinfo in its authority must report scheme, authority and path as given. */
157:            public final void testFtpUris() throws URIException {
158:                final String scheme = "ftp";
159:                final String authority = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
160:                final String path = "/clzreceive/";
161:                final UURI ftp = UURIFactory.getInstance(scheme + "://" + authority + path);
162:                // Each getter is read once and reused for both message and comparison.
163:                final String gotScheme = ftp.getScheme();
164:                assertTrue("Failed to get matching scheme: " + gotScheme, gotScheme.equals(scheme));
165:                final String gotAuthority = ftp.getAuthority();
166:                assertTrue("Failed to get matching authority: " + gotAuthority,
167:                        gotAuthority.equals(authority));
168:                final String gotPath = ftp.getPath();
169:                assertTrue("Failed to get matching path: " + gotPath, gotPath.equals(path));
170:            }
171:
172:            /**
173:             * Test that we get all whitespace even if the uri is already escaped:
174:             * a plain space, U+001D, and a space in a reference resolved on a base.
175:             */
176:            public final void testWhitespaceEscaped() throws URIException {
177:                final UURI spaced = UURIFactory.getInstance("http://archive.org/index%25 .html");
178:                assertTrue("Not equal " + spaced.toString(),
179:                        spaced.toString().equals("http://archive.org/index%25%20.html"));
180:                final UURI control = UURIFactory.getInstance("http://archive.org/index%25\u001D.html");
181:                assertEquals("whitespace escaping",
182:                        "http://archive.org/index%25%1D.html".toLowerCase(), control.toString());
183:                final UURI base = UURIFactory.getInstance("http://gemini.info.usaid.gov/directory/"
184:                        + "pbResults.cfm?&urlNameLast=Rumplestiltskin");
185:                final UURI resolved = UURIFactory.getInstance(base,
186:                        "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location="
187:                                + "RRB%20%20%20%205%2E08%2D006");
188:                assertEquals("whitespace escaping",
189:                        "http://gemini.info.usaid.gov/directory/faxResults.cfm?"
190:                        + "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006",
191:                        resolved.toString());
192:            }
193:
194:            //	public final void testFailedGetPath() throws URIException {
195:            //		final String path = "/RealMedia/ads/" +
196:            //		"click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";
197:            //        // decoding in getPath will interpret %CA as 8-bit escaped char,
198:            //        // possibly incomplete
199:            //		final String uri = "http://ads.nandomedia.com" + path;
200:            //		final UURI uuri = UURIFactory.getInstance(uri);
201:            //		String foundPath = uuri.getPath();
202:            //		assertEquals("unexpected path", path, foundPath);
203:            //	}
204:
205:            public final void testDnsHost() throws URIException {
206:                String uri = "dns://ads.nandomedia.com:81/one.html";
207:                UURI uuri = UURIFactory.getInstance(uri);
208:                String host = uuri.getReferencedHost();
209:                assertTrue("Host is wrong " + host, host
210:                        .equals("ads.nandomedia.com"));
211:                uri = "dns:ads.nandomedia.com";
212:                uuri = UURIFactory.getInstance(uri);
213:                host = uuri.getReferencedHost();
214:                assertTrue("Host is wrong " + host, host
215:                        .equals("ads.nandomedia.com"));
216:                uri = "dns:ads.nandomedia.com?a=b";
217:                uuri = UURIFactory.getInstance(uri);
218:                host = uuri.getReferencedHost();
219:                assertTrue("Host is wrong " + host, host
220:                        .equals("ads.nandomedia.com"));
221:            }
222:
223:            /** Stray '%' signs are expected to come back from the factory untouched. */
224:            public final void testPercentEscaping() throws URIException {
225:                // tests indicate firefox (1.0.6) does not encode '%' at all
226:                final String strayPercents = "http://archive.org/%a%%%%%.html";
227:                final String produced = UURIFactory.getInstance(strayPercents).toString();
228:                assertEquals("Not equal", strayPercents, produced);
229:            }
230:
231:            /** Double slashes inside a relative path must survive resolution against the base. */
232:            public final void testRelativeDblPathSlashes() throws URIException {
233:                final UURI base = UURIFactory.getInstance("http://www.archive.org/index.html");
234:                final UURI resolved = UURIFactory.getInstance(base, "JIGOU//KYC//INDEX.HTM");
235:                final String message = "Double slash not working " + resolved.toString();
236:                final boolean kept = resolved.getPath().equals("/JIGOU//KYC//INDEX.HTM");
237:                assertTrue(message, kept);
238:            }
239:
240:            /** "http:boo" carries a scheme but no authority: it must resolve against the base's directory. */
241:            public final void testRelativeWithScheme() throws URIException {
242:                final UURI base = UURIFactory.getInstance("http://www.example.com/some/page");
243:                final UURI resolved = UURIFactory.getInstance(base, "http:boo");
244:                // assertEquals prints expected and actual when resolution goes wrong.
245:                assertEquals("Relative with scheme not working " + resolved,
246:                        "http://www.example.com/some/boo", resolved.toString());
247:            }
248:
249:            /** Passes when resolving a link that ends in stray markup against a long base throws nothing. */
250:            public final void testBadBaseResolve() throws URIException {
251:                final String baseUri = "http://license.joins.com/board/"
252:                        + "etc_board_list.asp?board_name=new_main&b_type=&nPage="
253:                        + "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage="
254:                        + "notice&gate=02";
255:                UURIFactory.getInstance(UURIFactory.getInstance(baseUri), "http://www.changeup.com/...</a");
256:            }
257:
258:            public final void testTilde() throws URIException { // a '~' in the path must round-trip
259:                noChangeExpected("http://license.joins.com/~igor"); // asserts the string form equals the input
260:            }
261:
262:            /**
263:             * Firefox allows curlies in the query string portion of a URL only
264:             * (converts curlies if they are in the path portion ahead of the
265:             * query string).  Checked here: kept in the query, escaped in the
266:             * path, and refused with a URIException in the host.
267:             */
268:            public final void testCurlies() throws URIException {
269:                UURI uuri = noChangeExpected("http://license.joins.com/igor?one={curly}");
270:                // JUnit's argument order is (expected, actual), so the literal goes first.
271:                assertEquals("one={curly}", uuri.getQuery());
272:                assertEquals("http://license.joins.com/igor%7Bcurly%7D.html",
273:                        UURIFactory.getInstance("http://license.joins.com/igor{curly}.html").toString());
274:                boolean exception = false;
275:                try {
276:                    UURIFactory.getInstance("http://license.{curly}.com/igor.html");
277:                } catch (URIException u) {
278:                    exception = true;
279:                }
280:                assertTrue("Did not get exception.", exception);
281:            }
282:
283:            protected UURI noChangeExpected(final String original)
284:                    throws URIException {
285:                UURI uuri = UURIFactory.getInstance(original);
286:                assertEquals(original, uuri.toString());
287:                return uuri;
288:            }
289:
290:            /** Leading and trailing spaces/NBSPs must be trimmed; the expected form shows interior ones as %20. */
291:            public final void testTrimSpaceNBSP() throws URIException {
292:                final String padded = "   http://archive.org/DIR WITH SPACES/"
293:                        + UURIFactory.NBSP + "home.html    " + UURIFactory.NBSP + "   ";
294:                final String wanted = "http://archive.org/DIR%20WITH%20SPACES/%20home.html";
295:                final UURI trimmed = UURIFactory.getInstance(padded);
296:                final boolean same = trimmed.toString().equals(wanted);
297:                assertTrue("Not equal " + trimmed.toString(), same);
298:            }
299:
300:            /**
301:             * A raw space followed by an already-encoded "%20" in the query must come
302:             * out as "%20%20" ([ 1010966 ] crawl.log has URIs with spaces in them).
303:             * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
304:             * @throws URIException if the factory rejects the URI
305:             */
306:            public final void testSpaceDoubleEncoding() throws URIException {
307:                final String wanted = "http://www.brook.edu/i.html?%20%20taxonomy=Politics";
308:                final UURI encoded = UURIFactory.getInstance(
309:                        "http://www.brook.edu/i.html? %20taxonomy=Politics", "ISO-8859-1");
310:                assertTrue("Not equal " + encoded.toString(), encoded.toString().equals(wanted));
311:            }
312:
313:            /**
314:             * Test for doubly-encoded sequences: re-submitting an already encoded
315:             * URI must leave its %XX sequences as they are.
316:             * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
317:             * @throws URIException if the factory rejects any of the URIs
318:             */
319:            public final void testDoubleEncoding() throws URIException {
320:                final String latin1 = "ISO-8859-1";
321:                final String raw = "http://archive.org/DIR WITH SPACES/home\u00E6.html";
322:                final String wantedLatin1 = "http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
323:                final String wantedUtf8 = "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
324:
325:                // Explicit ISO-8859-1: encode once, then feed the result back twice.
326:                UURI current = UURIFactory.getInstance(raw, latin1);
327:                assertEquals("single encoding", wantedLatin1, current.toString());
328:                current = UURIFactory.getInstance(current.toString(), latin1);
329:                current = UURIFactory.getInstance(current.toString(), latin1);
330:                assertEquals("double encoding", wantedLatin1, current.toString());
331:
332:                // Do default utf-8 test, again followed by two round trips.
333:                current = UURIFactory.getInstance(raw);
334:                assertEquals("Not equal utf8", wantedUtf8, current.toString());
335:                current = UURIFactory.getInstance(current.toString());
336:                current = UURIFactory.getInstance(current.toString());
337:                assertEquals("Not equal (dbl-encoding) utf8", wantedUtf8, current.toString());
338:            }
339:
340:            /**
341:             * Test for syntax errors stop page parsing.
342:             * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
343:             * @throws URIException
344:             */
345:            public final void testThreeSlashes() throws URIException {
346:                UURI goodURI = UURIFactory
347:                        .getInstance("http://lcweb.loc.gov/rr/goodtwo.html");
348:                String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
349:                UURI rewrittenURI = UURIFactory.getInstance(uuri);
350:                assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
351:                        .toString().equals(rewrittenURI.toString()));
352:                uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
353:                rewrittenURI = UURIFactory.getInstance(uuri);
354:                assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
355:                        .toString().equals(rewrittenURI.toString()));
356:                // Check https.
357:                goodURI = UURIFactory
358:                        .getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
359:                uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
360:                rewrittenURI = UURIFactory.getInstance(uuri);
361:                assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
362:                        .toString().equals(rewrittenURI.toString()));
363:            }
364:
365:            /** A URI string without a scheme must be rejected with a URIException. */
366:            public final void testNoScheme() {
367:                final String schemeless = "www.loc.gov/rr/european/egw/polishex.html";
368:                try {
369:                    UURIFactory.getInstance(schemeless);
370:                } catch (URIException expected) {
371:                    // Expected exception: the test passes here.
372:                    return;
373:                }
374:                // Reached only when the factory accepted the schemeless string.
375:                fail("Didn't get expected exception: " + schemeless);
376:            }
377:
378:            /**
379:             * An absolute-path reference ("/home.html") must replace the base's
380:             * whole path while keeping its host and port; assertEquals shows both strings on failure.
381:             */
382:            public final void testRelative() throws URIException {
383:                final UURI base = UURIFactory.getInstance("http://archive.org:83/one/two/three.html");
384:                final UURI wanted = UURIFactory.getInstance("http://archive.org:83/home.html");
385:                assertEquals("Not equal", wanted.toString(), UURIFactory.getInstance(base, "/home.html").toString());
386:            }
387:
388:            /**
389:             * Test that an empty uuri does the right thing -- that we get back the
390:             * base.
391:             *
392:             * @throws URIException if the factory rejects the base URI
393:             */
394:            public final void testRelativeEmpty() throws URIException {
395:                final String baseText = "http://archive.org:83/one/two/three.html";
396:                final UURI expected = UURIFactory.getInstance(baseText);
397:                final UURI base = UURIFactory.getInstance(baseText);
398:                // Resolving the empty string should give a URI whose string form equals the base's.
399:                final UURI resolved = UURIFactory.getInstance(base, "");
400:                final boolean same = expected.toString().equals(resolved.toString());
401:                assertTrue("Empty length don't work", same);
402:            }
403:
404:            /**
405:             * An absolute reference must win over the base it is resolved against.
406:             */
407:            public final void testAbsolute() throws URIException {
408:                final String absolute = "http://archive.org:83/home.html";
409:                final UURI base = UURIFactory.getInstance("http://archive.org:83/one/two/three.html");
410:                final UURI resolved = UURIFactory.getInstance(base, absolute);
411:                // assertEquals reports both strings; the bare "Not equal" message alone does not.
412:                assertEquals("Not equal", UURIFactory.getInstance(absolute).toString(), resolved.toString());
413:            }
414:
415:            /**
416:             * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts): each URI below must make the factory throw a URIException.
417:             * @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
418:             */
419:            public final void testHostWithLessThan() {
420:                checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A"); // stray markup right after the host
421:                checkExceptionOnIllegalDomainlabel("http://C|/unzipped/426/spacer.gif"); // "C|" where a host belongs
422:                checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\""); // double quote right after the host
423:            }
424:
425:            /**
426:             * Test for [ 1012520 ] UURI.length() &gt; 2k: a 2080 character URI is
427:             * accepted, ten characters more must raise a URIException.
428:             * @throws URIException if the 2080 character URI is rejected
429:             * @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() &gt; 2k</a>
430:             */
431:            public final void test2kURI() throws URIException {
432:                final String segment = "/123456789";
433:                final StringBuffer uri = new StringBuffer("http://a.b");
434:                for (int added = 0; added < 207; added++) {
435:                    uri.append(segment);
436:                }
437:                // String should be 2080 characters long.  Legal.
438:                UURIFactory.getInstance(uri.toString());
439:                // Add ten more characters and make size illegal.
440:                final String tooLong = uri.append(segment).toString();
441:                try {
442:                    UURIFactory.getInstance(tooLong);
443:                } catch (URIException expected) {
444:                    // The over-long URI was refused, as required.
445:                    return;
446:                }
447:                fail("No expected exception complaining about long URI");
448:            }
449:
450:            /** Fails the running test unless the factory throws URIException for {@code uuri}. */
451:            private void checkExceptionOnIllegalDomainlabel(String uuri) {
452:                try {
453:                    UURIFactory.getInstance(uuri);
454:                } catch (URIException expected) {
455:                    // Expected exception.
456:                    return;
457:                }
458:                // No exception was thrown: report the offending input.
459:                fail("Didn't get expected exception: " + uuri);
460:            }
461:
462:            /**
463:             * Test for doing separate DNS lookup for same host: a host written with
464:             * a trailing dot must report the same getHost() value as the host
465:             * written without it.
466:             * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
467:             * @throws URIException if the factory rejects either URI
468:             */
469:            public final void testHostWithPeriod() throws URIException {
470:                final UURI dotted = UURIFactory
471:                        .getInstance("http://www.loc.gov./index.html");
472:                final UURI plain = UURIFactory
473:                        .getInstance("http://www.loc.gov/index.html");
474:                assertEquals("Failed equating hosts with dot", dotted.getHost(), plain.getHost());
475:            }
476:
477:            /**
478:             * Test for NPE in java.net.URI.encode: a URI whose query holds numeric character references must still yield a non-null UURI.
479:             * Despite the method name, the unusual characters sit after the "??", not in the host.
480:             * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
481:             * @throws URIException if the factory rejects the URI
482:             */
483:            public final void testHostEncodedChars() throws URIException {
484:                String s = "http://g.msn.co.kr/0nwkokr0/00/19??"
485:                        + "PS=10274&NC=10009&CE=42&CP=949&HL="
486:                        + "&#65533;&#65533;&#65533;?&#65533;&#65533;";
487:                assertNotNull("Encoded chars " + s, UURIFactory.getInstance(s));
488:            }
489:
490:            /**
491:             * Test for java.net.URI parses %20 but getHost null.
492:             * A host containing a space, whether escaped as %20 or literal, must be
493:             * refused with a URIException.
494:             *
495:             * See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
496:             */
497:            public final void testSpaceInHost() {
498:                final String tail = ".gov.uk/lpsa/challenge/pdf/propect.pdf";
499:                final String escapedSpace = "http://www.local-regions.odpm%20" + tail;
500:                final String realSpace = "http://www.local-regions.odpm " + tail;
501:                // First the escaped form of the space ...
502:                try {
503:                    UURIFactory.getInstance(escapedSpace);
504:                    fail("Did not fail with escaped space.");
505:                } catch (URIException expected) {
506:                    // Rejected, as required.
507:                }
508:
509:                // ... then the literal space.
510:                try {
511:                    UURIFactory.getInstance(realSpace);
512:                    fail("Did not fail with real space.");
513:                } catch (URIException expected) {
514:                    // Rejected, as required.
515:                }
516:            }
517:
518:            /**
519:             * Test for java.net.URI chokes on hosts_with_underscores: getHost()
520:             * must hand back a host name containing underscores unchanged.
521:             * @see  <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
522:             * @throws URIException if the factory rejects the URI
523:             */
524:            public final void testHostWithUnderscores() throws URIException {
525:                final String host = "x_underscore_underscore.2u.com.tw";
526:                final UURI underscored = UURIFactory
527:                        .getInstance("http://" + host + "/nonexistent_page.html");
528:                assertEquals("Failed get of host with underscore", host, underscored.getHost());
529:            }
530:
531:            /**
532:             * Two dots for igor: a host with two consecutive dots must be rejected.
533:             */
534:            public final void testTwoDots() {
535:                try {
536:                    UURIFactory
537:                            .getInstance("http://x_underscore_underscore..2u.com/nonexistent_page.html");
538:                } catch (URIException expected) {
539:                    // The consecutive dots were refused, as required.
540:                    return;
541:                }
542:                // Falling through means the factory accepted the host.
543:                fail("Two dots did not throw exception");
544:            }
545:
546:            /**
547:             * Test for java.net.URI#getHost fails when leading digit.
548:             *
549:             * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
550:             * @throws URIException
551:             */
552:            public final void testHostWithDigit() throws URIException {
553:                UURI uuri = UURIFactory
554:                        .getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
555:                assertEquals("Failed get of host with digit",
556:                        "0204chat.2u.com.tw", uuri.getHost());
557:            }
558:
559:            /**
560:             * Test for Constraining java URI class.
561:             *
562:             * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
563:             */
564:            public final void testPort() {
565:                checkBadPort("http://www.tyopaikat.com:a/robots.txt");
566:                checkBadPort("http://158.144.21.3:80808/robots.txt");
567:                checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt");
568:                checkBadPort("https://webmail.gse.harvard.edu:9100robots.txt/robots.txt");
569:                checkBadPort("https://webmail.gse.harvard.edu:0/robots.txt/robots.txt");
570:            }
571:
572:            /**
573:             * Test bad port throws exception.
574:             * @param uri URI with bad port to check.
575:             */
576:            private void checkBadPort(String uri) {
577:                boolean exception = false;
578:                try {
579:                    UURIFactory.getInstance(uri);
580:                } catch (URIException e) {
581:                    exception = true;
582:                }
583:                assertTrue("Didn't throw exception: " + uri, exception);
584:            }
585:
586:            /**
587:             * Preserve userinfo capitalization.
588:             * @throws URIException
589:             */
590:            public final void testUserinfo() throws URIException {
591:                final String authority = "stack:StAcK@www.tyopaikat.com";
592:                final String uri = "http://" + authority + "/robots.txt";
593:                UURI uuri = UURIFactory.getInstance(uri);
594:                assertEquals("Authority not equal", uuri.getAuthority(),
595:                        authority);
596:                /*
597:                String tmp = uuri.toString();
598:                assertTrue("URI not equal", tmp.equals(uri));
599:                 */
600:            }
601:
602:            /**
603:             * Test user info + port
604:             * @throws URIException
605:             */
606:            public final void testUserinfoPlusPort() throws URIException {
607:                final String userInfo = "stack:StAcK";
608:                final String authority = "www.tyopaikat.com";
609:                final int port = 8080;
610:                final String uri = "http://" + userInfo + "@" + authority + ":"
611:                        + port + "/robots.txt";
612:                UURI uuri = UURIFactory.getInstance(uri);
613:                assertEquals("Host not equal", authority, uuri.getHost());
614:                assertEquals("Userinfo Not equal", userInfo, uuri.getUserinfo());
615:                assertEquals("Port not equal", port, uuri.getPort());
616:                assertEquals("Authority wrong",
617:                        "stack:StAcK@www.tyopaikat.com:8080", uuri
618:                                .getAuthority());
619:                assertEquals("AuthorityMinusUserinfo wrong",
620:                        "www.tyopaikat.com:8080", uuri
621:                                .getAuthorityMinusUserinfo());
622:
623:            }
624:
625:            /**
626:             * Tests from rfc2396 with amendments to accommodate differences
627:             * intentionally added to make our URI handling like IE's.
628:             *
629:             * <pre>
630:             *       g:h           =  g:h
631:             *       g             =  http://a/b/c/g
632:             *       ./g           =  http://a/b/c/g
633:             *       g/            =  http://a/b/c/g/
634:             *       /g            =  http://a/g
635:             *       //g           =  http://g
636:             *       ?y            =  http://a/b/c/?y
637:             *       g?y           =  http://a/b/c/g?y
638:             *       #s            =  (current document)#s
639:             *       g#s           =  http://a/b/c/g#s
640:             *       g?y#s         =  http://a/b/c/g?y#s
641:             *       ;x            =  http://a/b/c/;x
642:             *       g;x           =  http://a/b/c/g;x
643:             *       g;x?y#s       =  http://a/b/c/g;x?y#s
644:             *       .             =  http://a/b/c/
645:             *       ./            =  http://a/b/c/
646:             *       ..            =  http://a/b/
647:             *       ../           =  http://a/b/
648:             *       ../g          =  http://a/b/g
649:             *       ../..         =  http://a/
650:             *       ../../        =  http://a/
651:             *       ../../g       =  http://a/g
652:             * </pre>
653:             *
654:             * @throws URIException
655:             */
656:            public final void testRFC2396Relative() throws URIException {
657:                UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
658:                TreeMap<String, String> m = new TreeMap<String, String>();
659:                m.put("..", "http://a/b/");
660:                m.put("../", "http://a/b/");
661:                m.put("../g", "http://a/b/g");
662:                m.put("../..", "http://a/");
663:                m.put("../../", "http://a/");
664:                m.put("../../g", "http://a/g");
665:                m.put("g#s", "http://a/b/c/g#s");
666:                m.put("g?y#s ", "http://a/b/c/g?y#s");
667:                m.put(";x", "http://a/b/c/;x");
668:                m.put("g;x", "http://a/b/c/g;x");
669:                m.put("g;x?y#s", "http://a/b/c/g;x?y#s");
670:                m.put(".", "http://a/b/c/");
671:                m.put("./", "http://a/b/c/");
672:                m.put("g", "http://a/b/c/g");
673:                m.put("./g", "http://a/b/c/g");
674:                m.put("g/", "http://a/b/c/g/");
675:                m.put("/g", "http://a/g");
676:                m.put("//g", "http://g");
677:                m.put("?y", "http://a/b/c/?y");
678:                m.put("g?y", "http://a/b/c/g?y");
679:                // EXTRAS beyond the RFC set.
680:                // TODO: That these resolve to a path of /a/g might be wrong.  Perhaps
681:                // it should be '/g'?.
682:                m.put("/../../../../../../../../g", "http://a/g");
683:                m.put("../../../../../../../../g", "http://a/g");
684:                m.put("../G", "http://a/b/G");
685:                for (Iterator i = m.keySet().iterator(); i.hasNext();) {
686:                    String key = (String) i.next();
687:                    String value = (String) m.get(key);
688:                    UURI uuri = UURIFactory.getInstance(base, key);
689:                    assertTrue("Unexpected " + key + " " + value + " " + uuri,
690:                            uuri.equals(UURIFactory.getInstance(value)));
691:                }
692:            }
693:
694:            /**
695:             * A UURI should always be without a 'fragment' segment, which is
696:             * unused and irrelevant for network fetches. 
697:             *  
698:             * See [ 970666 ] #anchor links not trimmed, and thus recrawled 
699:             * 
700:             * @throws URIException
701:             */
702:            public final void testAnchors() throws URIException {
703:                UURI uuri = UURIFactory
704:                        .getInstance("http://www.example.com/path?query#anchor");
705:                assertEquals("Not equal", "http://www.example.com/path?query",
706:                        uuri.toString());
707:            }
708:
709:            /**
710:             * Ensure that URI strings beginning with a colon are treated
711:             * the same as browsers do (as relative, rather than as absolute
712:             * with zero-length scheme). 
713:             * 
714:             * @throws URIException
715:             */
716:            public void testStartsWithColon() throws URIException {
717:                UURI base = UURIFactory
718:                        .getInstance("http://www.example.com/path/page");
719:                UURI uuri = UURIFactory.getInstance(base, ":foo");
720:                assertEquals("derelativize starsWithColon", uuri.getURI(),
721:                        "http://www.example.com/path/:foo");
722:            }
723:
724:            /**
725:             * Ensure that stray trailing '%' characters do not prevent
726:             * UURI instances from being created, and are reasonably 
727:             * escaped when encountered. 
728:             *
729:             * @throws URIException
730:             */
731:            public void testTrailingPercents() throws URIException {
732:                String plainPath = "http://www.example.com/path%";
733:                UURI plainPathUuri = UURIFactory.getInstance(plainPath);
734:                assertEquals("plainPath getURI", plainPath, plainPathUuri
735:                        .getURI());
736:                assertEquals("plainPath getEscapedURI",
737:                        "http://www.example.com/path%", // browsers don't escape '%'
738:                        plainPathUuri.getEscapedURI());
739:
740:                String partiallyEscapedPath = "http://www.example.com/pa%20th%";
741:                UURI partiallyEscapedPathUuri = UURIFactory
742:                        .getInstance(partiallyEscapedPath);
743:                //        assertEquals("partiallyEscapedPath getURI", 
744:                //                "http://www.example.com/pa th%", // TODO: is this desirable?
745:                ////              partiallyEscapedPath,
746:                //                partiallyEscapedPathUuri.getURI());
747:                assertEquals("partiallyEscapedPath getEscapedURI",
748:                        "http://www.example.com/pa%20th%",
749:                        partiallyEscapedPathUuri.getEscapedURI());
750:
751:                String plainQueryString = "http://www.example.com/path?q=foo%";
752:                UURI plainQueryStringUuri = UURIFactory
753:                        .getInstance(plainQueryString);
754:                //        assertEquals("plainQueryString getURI", 
755:                //                plainQueryString,
756:                //                plainQueryStringUuri.getURI());
757:                assertEquals("plainQueryString getEscapedURI",
758:                        "http://www.example.com/path?q=foo%",
759:                        plainQueryStringUuri.getEscapedURI());
760:
761:                String partiallyEscapedQueryString = "http://www.example.com/pa%20th?q=foo%";
762:                UURI partiallyEscapedQueryStringUuri = UURIFactory
763:                        .getInstance(partiallyEscapedQueryString);
764:                assertEquals("partiallyEscapedQueryString getURI",
765:                        "http://www.example.com/pa th?q=foo%",
766:                        partiallyEscapedQueryStringUuri.getURI());
767:                assertEquals("partiallyEscapedQueryString getEscapedURI",
768:                        "http://www.example.com/pa%20th?q=foo%",
769:                        partiallyEscapedQueryStringUuri.getEscapedURI());
770:            }
771:
772:            /**
773:             * Ensure that stray '%' characters do not prevent
774:             * UURI instances from being created, and are reasonably 
775:             * escaped when encountered. 
776:             *
777:             * @throws URIException
778:             */
779:            public void testStrayPercents() throws URIException {
780:                String oneStray = "http://www.example.com/pa%th";
781:                UURI oneStrayUuri = UURIFactory.getInstance(oneStray);
782:                assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI());
783:                assertEquals("oneStray getEscapedURI",
784:                        "http://www.example.com/pa%th", // browsers don't escape '%'
785:                        oneStrayUuri.getEscapedURI());
786:
787:                String precededByValidEscape = "http://www.example.com/pa%20th%way";
788:                UURI precededByValidEscapeUuri = UURIFactory
789:                        .getInstance(precededByValidEscape);
790:                assertEquals("precededByValidEscape getURI",
791:                        "http://www.example.com/pa th%way", // getURI interprets escapes
792:                        precededByValidEscapeUuri.getURI());
793:                assertEquals("precededByValidEscape getEscapedURI",
794:                        "http://www.example.com/pa%20th%way",
795:                        precededByValidEscapeUuri.getEscapedURI());
796:
797:                String followedByValidEscape = "http://www.example.com/pa%th%20way";
798:                UURI followedByValidEscapeUuri = UURIFactory
799:                        .getInstance(followedByValidEscape);
800:                assertEquals("followedByValidEscape getURI",
801:                        "http://www.example.com/pa%th way", // getURI interprets escapes
802:                        followedByValidEscapeUuri.getURI());
803:                assertEquals("followedByValidEscape getEscapedURI",
804:                        "http://www.example.com/pa%th%20way",
805:                        followedByValidEscapeUuri.getEscapedURI());
806:            }
807:
808:            public void testEscapingNotNecessary() throws URIException {
809:                String escapesUnnecessary = "http://www.example.com/misc;reserved:chars@that&don't=need"
810:                        + "+escaping$even,though!you(might)initially?think#so";
811:                // expect everything but the #fragment
812:                String expected = escapesUnnecessary.substring(0,
813:                        escapesUnnecessary.length() - 3);
814:                assertEquals("escapes unnecessary", expected, UURIFactory
815:                        .getInstance(escapesUnnecessary).toString());
816:            }
817:
818:            public void testIdn() throws URIException {
819:                // See http://www.josefsson.org/idn.php.
820:                String idn1 = new String("http://räksmörgås.josefßon.org/");
821:                String puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/";
822:                assertEquals("encoding of " + idn1, puny1, UURIFactory
823:                        .getInstance(idn1).toString());
824:                String idn2 = "http://www.pølse.dk/";
825:                String puny2 = "http://www.xn--plse-gra.dk/";
826:                assertEquals("encoding of " + idn2, puny2, UURIFactory
827:                        .getInstance(idn2).toString());
828:            }
829:
830:            public void testNewLineInURL() throws URIException {
831:                UURI uuri = UURIFactory.getInstance("http://www.ar\rchive\n."
832:                        + "org/i\n\n\r\rndex.html");
833:                assertEquals("http://www.archive.org/index.html", uuri
834:                        .toString());
835:            }
836:
837:            public void testTabsInURL() throws URIException {
838:                UURI uuri = UURIFactory.getInstance("http://www.ar\tchive\t."
839:                        + "org/i\t\r\n\tndex.html");
840:                assertEquals("http://www.archive.org/index.html", uuri
841:                        .toString());
842:            }
843:
844:            public void testQueryEscaping() throws URIException {
845:                UURI uuri = UURIFactory
846:                        .getInstance("http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<");
847:                assertEquals(
848:                        // tests in FF1.5 indicate it only escapes " < > 
849:                        "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C",
850:                        uuri.toString());
851:            }
852:
853:            /**
854:             * Check that our 'normalization' does same as Nutch's
855:             * Below before-and-afters were taken from the nutch urlnormalizer-basic
856:             * TestBasicURLNormalizer class  (December 2006, Nutch 0.9-dev).
857:             * @throws URIException
858:             */
859:            public void testSameAsNutchURLFilterBasic() throws URIException {
860:                assertEquals(UURIFactory.getInstance(" http://foo.com/ ")
861:                        .toString(), "http://foo.com/");
862:
863:                // check that protocol is lower cased
864:                assertEquals(UURIFactory.getInstance("HTTP://foo.com/")
865:                        .toString(), "http://foo.com/");
866:
867:                // check that host is lower cased
868:                assertEquals(UURIFactory.getInstance(
869:                        "http://Foo.Com/index.html").toString(),
870:                        "http://foo.com/index.html");
871:                assertEquals(UURIFactory.getInstance(
872:                        "http://Foo.Com/index.html").toString(),
873:                        "http://foo.com/index.html");
874:
875:                // check that port number is normalized
876:                assertEquals(UURIFactory.getInstance(
877:                        "http://foo.com:80/index.html").toString(),
878:                        "http://foo.com/index.html");
879:                assertEquals(UURIFactory.getInstance("http://foo.com:81/")
880:                        .toString(), "http://foo.com:81/");
881:
882:                // check that null path is normalized
883:                assertEquals(UURIFactory.getInstance("http://foo.com")
884:                        .toString(), "http://foo.com/");
885:
886:                // check that references are removed
887:                assertEquals(UURIFactory.getInstance(
888:                        "http://foo.com/foo.html#ref").toString(),
889:                        "http://foo.com/foo.html");
890:
891:                //     // check that encoding is normalized
892:                //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
893:
894:                // check that unnecessary "../" are removed
895:                assertEquals(UURIFactory.getInstance("http://foo.com/aa/../")
896:                        .toString(), "http://foo.com/");
897:                assertEquals(UURIFactory
898:                        .getInstance("http://foo.com/aa/bb/../").toString(),
899:                        "http://foo.com/aa/");
900:
901:                /* We fail this one.  Here we produce: 'http://foo.com/'.
902:                assertEquals(UURIFactory.
903:                        getInstance("http://foo.com/aa/..").toString(),
904:                    "http://foo.com/aa/..");
905:                 */
906:
907:                assertEquals(UURIFactory.getInstance(
908:                        "http://foo.com/aa/bb/cc/../../foo.html").toString(),
909:                        "http://foo.com/aa/foo.html");
910:                assertEquals(UURIFactory.getInstance(
911:                        "http://foo.com/aa/bb/../cc/dd/../ee/foo.html")
912:                        .toString(), "http://foo.com/aa/cc/ee/foo.html");
913:                assertEquals(UURIFactory.getInstance(
914:                        "http://foo.com/../foo.html").toString(),
915:                        "http://foo.com/foo.html");
916:                assertEquals(UURIFactory.getInstance(
917:                        "http://foo.com/../../foo.html").toString(),
918:                        "http://foo.com/foo.html");
919:                assertEquals(UURIFactory.getInstance(
920:                        "http://foo.com/../aa/../foo.html").toString(),
921:                        "http://foo.com/foo.html");
922:                assertEquals(UURIFactory.getInstance(
923:                        "http://foo.com/aa/../../foo.html").toString(),
924:                        "http://foo.com/foo.html");
925:                assertEquals(UURIFactory.getInstance(
926:                        "http://foo.com/aa/../bb/../foo.html/../../")
927:                        .toString(), "http://foo.com/");
928:                assertEquals(UURIFactory.getInstance(
929:                        "http://foo.com/../aa/foo.html").toString(),
930:                        "http://foo.com/aa/foo.html");
931:                assertEquals(UURIFactory.getInstance(
932:                        "http://foo.com/../aa/../foo.html").toString(),
933:                        "http://foo.com/foo.html");
934:                assertEquals(UURIFactory.getInstance(
935:                        "http://foo.com/a..a/foo.html").toString(),
936:                        "http://foo.com/a..a/foo.html");
937:                assertEquals(UURIFactory.getInstance(
938:                        "http://foo.com/a..a/../foo.html").toString(),
939:                        "http://foo.com/foo.html");
940:                assertEquals(UURIFactory.getInstance(
941:                        "http://foo.com/foo.foo/../foo.html").toString(),
942:                        "http://foo.com/foo.html");
943:            }
944:
945:            public void testHttpSchemeColonSlash() {
946:                boolean exception = false;
947:                try {
948:                    UURIFactory.getInstance("https:/");
949:                } catch (URIException e) {
950:                    exception = true;
951:                }
952:                assertTrue("Didn't throw exception when one expected",
953:                        exception);
954:                exception = false;
955:                try {
956:                    UURIFactory.getInstance("http://");
957:                } catch (URIException e) {
958:                    exception = true;
959:                }
960:                assertTrue("Didn't throw exception when one expected",
961:                        exception);
962:            }
963:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.