001: /* UURIFactoryTest
002: *
003: * $Id: UURIFactoryTest.java 5106 2007-05-01 00:07:29Z gojomo $
004: *
005: * Created on Apr 2, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
026: package org.archive.net;
028: import java.util.Iterator;
029: import java.util.TreeMap;
031: import junit.framework.TestCase;
033: import org.apache.commons.httpclient.URIException;
035: /**
036: * Test UURIFactory for proper UURI creation across variety of
037: * important/tricky cases.
038: *
039: * Be careful writing this file. Make sure you write it with UTF-8 encoding.
040: *
041: * @author igor stack gojomo
042: */
043: public class UURIFactoryTest extends TestCase {
045: public final void testEscaping() throws URIException {
046: // Note: single quote is not being escaped by URI class.
047: final String ESCAPED_URISTR = "http://archive.org/"
056: + "a.gif"; // NBSP and SPACE should be trimmed;
058: final String URISTR = "http://archive.org/.././" + "\u00A0"
060: + UURIFactory.QUOT + UURIFactory.SQUOT
064: + "test/../a.gif" + "\u00A0" + UURIFactory.SPACE;
066: UURI uuri = UURIFactory.getInstance(URISTR);
067: final String uuriStr = uuri.toString();
068: assertEquals("expected escaping", ESCAPED_URISTR, uuriStr);
069: }
071: public final void testUnderscoreMakesPortParseFail()
072: throws URIException {
073: UURI uuri = UURIFactory
074: .getInstance("http://one-two_three:8080/index.html");
075: int port = uuri.getPort();
076: assertTrue("Failed find of port " + uuri, port == 8080);
077: }
079: public final void testRelativeURIWithTwoSlashes()
080: throws URIException {
081: UURI base = UURIFactory.getInstance("http://www.archive.org");
082: UURI uuri = UURIFactory.getInstance(base, "one//index.html");
083: assertTrue("Doesn't do right thing with two slashes " + uuri,
084: uuri.toString().equals(
085: "http://www.archive.org/one//index.html"));
086: }
088: public final void testTrailingEncodedSpace() throws URIException {
089: UURI uuri = UURIFactory
090: .getInstance("http://www.nps-shoes.co.uk%20");
091: assertTrue("Doesn't strip trailing encoded space 1 " + uuri,
092: uuri.toString().equals("http://www.nps-shoes.co.uk/"));
093: uuri = UURIFactory
094: .getInstance("http://www.nps-shoes.co.uk%20%20%20");
095: assertTrue("Doesn't strip trailing encoded space 2 " + uuri,
096: uuri.toString().equals("http://www.nps-shoes.co.uk/"));
097: }
099: public final void testPort0080is80() throws URIException {
100: UURI uuri = UURIFactory.getInstance("http://archive.org:0080");
101: assertTrue("Doesn't strip leading zeros " + uuri, uuri
102: .toString().equals("http://archive.org/"));
103: }
106: // the problematic input given -- specifically the "%6s" incomplete uri-escape,
107: // shouldn't necessarily be rejected as a bad URI. IE and Firefox, at least,
108: // will attempt to fetch such an URL (getting, in this case against that ad
109: // server, a bad-request error). Ideally, we'd generate exactly the same
110: // request against the server as they do. However, with the most recent
111: // fixup for stray '%' signs, we come close, but not exactly. That's enough
112: // to cause this test to fail (it's not getting the expected exception) but
113: // our almost-URI, which might be what was intended, is better than trying
114: // nothing.
115: // public final void testBadPath() {
116: // String message = null;
117: // try {
118: // UURIFactory.getInstance("http://ads.as4x.tmcs.net/" +
119: // "html.ng/site=cs&pagepos=102&page=home&adsize=1x1&context=" +
120: // "generic&Params.richmedia=yes%26city%3Dseattle%26" +
121: // "rstid%3D2415%26market_id%3D86%26brand%3Dcitysearch" +
122: // "%6state%3DWA");
123: // } catch (URIException e) {
124: // message = e.getMessage();
125: // }
126: // assertNotNull("Didn't get expected exception.", message);
127: // }
129: public final void testEscapeEncoding() throws URIException {
130: UURI uuri = UURIFactory.getInstance("http://www.y1y1.com/"
131: + "albums/userpics/11111/normal_%E3%E4%EC%EC%EC.jpg",
132: "windows-1256");
133: uuri.getPath();
134: }
136: public final void testTooLongAfterEscaping() {
137: StringBuffer buffer = new StringBuffer(
138: "http://www.archive.org/a/");
139: // Append bunch of spaces. When escaped, they'll triple in size.
140: for (int i = 0; i < 1024; i++) {
141: buffer.append(" ");
142: }
143: buffer.append("/index.html");
144: String message = null;
145: try {
146: UURIFactory.getInstance(buffer.toString());
147: } catch (URIException e) {
148: message = e.getMessage();
149: }
150: assertTrue("Wrong or no exception: " + message,
151: (message != null)
152: && message
153: .startsWith("Created (escaped) uuri >"));
154: }
156: public final void testFtpUris() throws URIException {
157: final String FTP = "ftp";
158: final String AUTHORITY = "pfbuser:pfbuser@mprsrv.agri.gov.cn";
159: final String PATH = "/clzreceive/";
160: final String uri = FTP + "://" + AUTHORITY + PATH;
161: UURI uuri = UURIFactory.getInstance(uri);
162: assertTrue(
163: "Failed to get matching scheme: " + uuri.getScheme(),
164: (uuri.getScheme()).equals(FTP));
165: assertTrue("Failed to get matching authority: "
166: + uuri.getAuthority(), (uuri.getAuthority())
167: .equals(AUTHORITY));
168: assertTrue("Failed to get matching path: " + uuri.getPath(),
169: (uuri.getPath()).equals(PATH));
170: }
172: public final void testWhitespaceEscaped() throws URIException {
173: // Test that we get all whitespace even if the uri is
174: // already escaped.
175: String uri = "http://archive.org/index%25 .html";
176: String tgtUri = "http://archive.org/index%25%20.html";
177: UURI uuri = UURIFactory.getInstance(uri);
178: assertTrue("Not equal " + uuri.toString(), uuri.toString()
179: .equals(tgtUri));
180: uri = "http://archive.org/index%25\u001D.html";
181: tgtUri = "http://archive.org/index%25%1D.html".toLowerCase();
182: uuri = UURIFactory.getInstance(uri);
183: assertEquals("whitespace escaping", tgtUri, uuri.toString());
184: uri = "http://gemini.info.usaid.gov/directory/"
185: + "pbResults.cfm?&urlNameLast=Rumplestiltskin";
186: tgtUri = "http://gemini.info.usaid.gov/directory/faxResults.cfm?"
187: + "name=Ebenezer%20+Rumplestiltskin,&location=RRB%20%20%20%205%2E08%2D006";
188: uuri = UURIFactory.getInstance(UURIFactory.getInstance(uri),
189: "faxResults.cfm?name=Ebenezer +Rumplestiltskin,&location="
190: + "RRB%20%20%20%205%2E08%2D006");
191: assertEquals("whitespace escaping", tgtUri, uuri.toString());
192: }
194: // public final void testFailedGetPath() throws URIException {
195: // final String path = "/RealMedia/ads/" +
196: // "click_lx.ads/%%PAGE%%/%%RAND%%/%%POS%%/%%CAMP%%/empty";
197: // // decoding in getPath will interpret %CA as 8-bit escaped char,
198: // // possibly incomplete
199: // final String uri = "http://ads.nandomedia.com" + path;
200: // final UURI uuri = UURIFactory.getInstance(uri);
201: // String foundPath = uuri.getPath();
202: // assertEquals("unexpected path", path, foundPath);
203: // }
205: public final void testDnsHost() throws URIException {
206: String uri = "dns://ads.nandomedia.com:81/one.html";
207: UURI uuri = UURIFactory.getInstance(uri);
208: String host = uuri.getReferencedHost();
209: assertTrue("Host is wrong " + host, host
210: .equals("ads.nandomedia.com"));
211: uri = "dns:ads.nandomedia.com";
212: uuri = UURIFactory.getInstance(uri);
213: host = uuri.getReferencedHost();
214: assertTrue("Host is wrong " + host, host
215: .equals("ads.nandomedia.com"));
216: uri = "dns:ads.nandomedia.com?a=b";
217: uuri = UURIFactory.getInstance(uri);
218: host = uuri.getReferencedHost();
219: assertTrue("Host is wrong " + host, host
220: .equals("ads.nandomedia.com"));
221: }
223: public final void testPercentEscaping() throws URIException {
224: final String uri = "http://archive.org/%a%%%%%.html";
225: // tests indicate firefox (1.0.6) does not encode '%' at all
226: final String tgtUri = "http://archive.org/%a%%%%%.html";
227: UURI uuri = UURIFactory.getInstance(uri);
228: assertEquals("Not equal", tgtUri, uuri.toString());
229: }
231: public final void testRelativeDblPathSlashes() throws URIException {
232: UURI base = UURIFactory
233: .getInstance("http://www.archive.org/index.html");
234: UURI uuri = UURIFactory.getInstance(base,
236: assertTrue("Double slash not working " + uuri.toString(), uuri
237: .getPath().equals("/JIGOU//KYC//INDEX.HTM"));
238: }
240: public final void testRelativeWithScheme() throws URIException {
241: UURI base = UURIFactory
242: .getInstance("http://www.example.com/some/page");
243: UURI uuri = UURIFactory.getInstance(base, "http:boo");
244: assertTrue("Relative with scheme not working "
245: + uuri.toString(), uuri.toString().equals(
246: "http://www.example.com/some/boo"));
247: }
249: public final void testBadBaseResolve() throws URIException {
250: UURI base = UURIFactory
251: .getInstance("http://license.joins.com/board/"
252: + "etc_board_list.asp?board_name=new_main&b_type=&nPage="
253: + "2&category=G&lic_id=70&site=changeup&g_page=changeup&g_sPage="
254: + "notice&gate=02");
255: UURIFactory.getInstance(base, "http://www.changeup.com/...</a");
256: }
258: public final void testTilde() throws URIException {
259: noChangeExpected("http://license.joins.com/~igor");
260: }
262: public final void testCurlies() throws URIException {
263: // Firefox allows curlies in the query string portion of a URL only
264: // (converts curlies if they are in the path portion ahead of the
265: // query string).
266: UURI uuri = noChangeExpected("http://license.joins.com/igor?one={curly}");
267: assertEquals(uuri.getQuery(), "one={curly}");
268: assertEquals(
269: UURIFactory.getInstance(
270: "http://license.joins.com/igor{curly}.html")
271: .toString(),
272: "http://license.joins.com/igor%7Bcurly%7D.html");
273: boolean exception = false;
274: try {
275: UURIFactory
276: .getInstance("http://license.{curly}.com/igor.html");
277: } catch (URIException u) {
278: exception = true;
279: }
280: assertTrue("Did not get exception.", exception);
281: }
283: protected UURI noChangeExpected(final String original)
284: throws URIException {
285: UURI uuri = UURIFactory.getInstance(original);
286: assertEquals(original, uuri.toString());
287: return uuri;
288: }
290: public final void testTrimSpaceNBSP() throws URIException {
291: final String uri = " http://archive.org/DIR WITH SPACES/"
292: + UURIFactory.NBSP + "home.html " + UURIFactory.NBSP
293: + " ";
294: final String tgtUri = "http://archive.org/DIR%20WITH%20SPACES/%20home.html";
295: UURI uuri = UURIFactory.getInstance(uri);
296: assertTrue("Not equal " + uuri.toString(), uuri.toString()
297: .equals(tgtUri));
298: }
300: /**
301: * Test space plus encoding ([ 1010966 ] crawl.log has URIs with spaces in them).
302: * See <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1010966&group_id=73833&atid=539099">[ 1010966 ] crawl.log has URIs with spaces in them</a>.
303: * @throws URIException
304: */
305: public final void testSpaceDoubleEncoding() throws URIException {
306: final String uri = "http://www.brook.edu/i.html? %20taxonomy=Politics";
307: final String encodedUri = "http://www.brook.edu/i.html?%20%20taxonomy=Politics";
308: UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
309: assertTrue("Not equal " + uuri.toString(), uuri.toString()
310: .equals(encodedUri));
311: }
313: /**
314: * Test for doubly-encoded sequences.
315: * See <a href="https://sourceforge.net/tracker/index.php?func=detail&aid=966219&group_id=73833&atid=539099">[ 966219 ] UURI doubly-encodes %XX sequences</a>.
316: * @throws URIException
317: */
318: public final void testDoubleEncoding() throws URIException {
319: final char ae = '\u00E6';
320: final String uri = "http://archive.org/DIR WITH SPACES/home"
321: + ae + ".html";
322: final String encodedUri = "http://archive.org/DIR%20WITH%20SPACES/home%E6.html";
323: UURI uuri = UURIFactory.getInstance(uri, "ISO-8859-1");
324: assertEquals("single encoding", encodedUri, uuri.toString());
325: // Dbl-encodes.
326: uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
327: uuri = UURIFactory.getInstance(uuri.toString(), "ISO-8859-1");
328: assertEquals("double encoding", encodedUri, uuri.toString());
329: // Do default utf-8 test.
330: uuri = UURIFactory.getInstance(uri);
331: final String encodedUtf8Uri = "http://archive.org/DIR%20WITH%20SPACES/home%C3%A6.html";
332: assertEquals("Not equal utf8", encodedUtf8Uri, uuri.toString());
333: // Now dbl-encode.
334: uuri = UURIFactory.getInstance(uuri.toString());
335: uuri = UURIFactory.getInstance(uuri.toString());
336: assertEquals("Not equal (dbl-encoding) utf8", encodedUtf8Uri,
337: uuri.toString());
338: }
340: /**
341: * Test for syntax errors stop page parsing.
342: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ] URI Syntax Errors stop page parsing</a>
343: * @throws URIException
344: */
345: public final void testThreeSlashes() throws URIException {
346: UURI goodURI = UURIFactory
347: .getInstance("http://lcweb.loc.gov/rr/goodtwo.html");
348: String uuri = "http:///lcweb.loc.gov/rr/goodtwo.html";
349: UURI rewrittenURI = UURIFactory.getInstance(uuri);
350: assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
351: .toString().equals(rewrittenURI.toString()));
352: uuri = "http:////lcweb.loc.gov/rr/goodtwo.html";
353: rewrittenURI = UURIFactory.getInstance(uuri);
354: assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
355: .toString().equals(rewrittenURI.toString()));
356: // Check https.
357: goodURI = UURIFactory
358: .getInstance("https://lcweb.loc.gov/rr/goodtwo.html");
359: uuri = "https:////lcweb.loc.gov/rr/goodtwo.html";
360: rewrittenURI = UURIFactory.getInstance(uuri);
361: assertTrue("Not equal " + goodURI + ", " + uuri, goodURI
362: .toString().equals(rewrittenURI.toString()));
363: }
365: public final void testNoScheme() {
366: boolean expectedException = false;
367: String uuri = "www.loc.gov/rr/european/egw/polishex.html";
368: try {
369: UURIFactory.getInstance(uuri);
370: } catch (URIException e) {
371: // Expected exception.
372: expectedException = true;
373: }
374: assertTrue("Didn't get expected exception: " + uuri,
375: expectedException);
376: }
378: public final void testRelative() throws URIException {
379: UURI uuriTgt = UURIFactory
380: .getInstance("http://archive.org:83/home.html");
381: UURI uri = UURIFactory
382: .getInstance("http://archive.org:83/one/two/three.html");
383: UURI uuri = UURIFactory.getInstance(uri, "/home.html");
384: assertTrue("Not equal", uuriTgt.toString().equals(
385: uuri.toString()));
386: }
388: /**
389: * Test that an empty uuri does the right thing -- that we get back the
390: * base.
391: *
392: * @throws URIException
393: */
394: public final void testRelativeEmpty() throws URIException {
395: UURI uuriTgt = UURIFactory
396: .getInstance("http://archive.org:83/one/two/three.html");
397: UURI uri = UURIFactory
398: .getInstance("http://archive.org:83/one/two/three.html");
399: UURI uuri = UURIFactory.getInstance(uri, "");
400: assertTrue("Empty length don't work", uuriTgt.toString()
401: .equals(uuri.toString()));
402: }
404: public final void testAbsolute() throws URIException {
405: UURI uuriTgt = UURIFactory
406: .getInstance("http://archive.org:83/home.html");
407: UURI uri = UURIFactory
408: .getInstance("http://archive.org:83/one/two/three.html");
409: UURI uuri = UURIFactory.getInstance(uri,
410: "http://archive.org:83/home.html");
411: assertTrue("Not equal", uuriTgt.toString().equals(
412: uuri.toString()));
413: }
415: /**
416: * Test for [ 962892 ] UURI accepting/creating unUsable URIs (bad hosts).
417: * @see <a href="https://sourceforge.net/tracker/?func=detail&atid=539099&aid=962892&group_id=73833">[ 962892 ] UURI accepting/creating unUsable URIs (bad hosts)</a>
418: */
419: public final void testHostWithLessThan() {
420: checkExceptionOnIllegalDomainlabel("http://www.betamobile.com</A");
421: checkExceptionOnIllegalDomainlabel("http://C|/unzipped/426/spacer.gif");
422: checkExceptionOnIllegalDomainlabel("http://www.lycos.co.uk\"/l/b/\"");
423: }
425: /**
426: * Test for [ 1012520 ] UURI.length() > 2k.
427: * @throws URIException
428: * @see <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=1012520&group_id=73833&atid=539099">[ 1012520 ] UURI.length() > 2k</a>
429: */
430: public final void test2kURI() throws URIException {
431: final StringBuffer buffer = new StringBuffer("http://a.b");
432: final String subPath = "/123456789";
433: for (int i = 0; i < 207; i++) {
434: buffer.append(subPath);
435: }
436: // String should be 2080 characters long. Legal.
437: UURIFactory.getInstance(buffer.toString());
438: boolean gotException = false;
439: // Add ten more characters and make size illegal.
440: buffer.append(subPath);
441: try {
442: UURIFactory.getInstance(buffer.toString());
443: } catch (URIException e) {
444: gotException = true;
445: }
446: assertTrue("No expected exception complaining about long URI",
447: gotException);
448: }
450: private void checkExceptionOnIllegalDomainlabel(String uuri) {
451: boolean expectedException = false;
452: try {
453: UURIFactory.getInstance(uuri);
454: } catch (URIException e) {
455: // Expected exception.
456: expectedException = true;
457: }
458: assertTrue("Didn't get expected exception: " + uuri,
459: expectedException);
460: }
462: /**
463: * Test for doing separate DNS lookup for same host
464: *
465: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=788277&group_id=73833&atid=539099">[ 788277 ] Doing separate DNS lookup for same host</a>
466: * @throws URIException
467: */
468: public final void testHostWithPeriod() throws URIException {
469: UURI uuri1 = UURIFactory
470: .getInstance("http://www.loc.gov./index.html");
471: UURI uuri2 = UURIFactory
472: .getInstance("http://www.loc.gov/index.html");
473: assertEquals("Failed equating hosts with dot", uuri1.getHost(),
474: uuri2.getHost());
475: }
477: /**
478: * Test for NPE in java.net.URI.encode
479: *
480: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=874220&group_id=73833&atid=539099">[ 874220 ] NPE in java.net.URI.encode</a>
481: * @throws URIException
482: */
483: public final void testHostEncodedChars() throws URIException {
484: String s = "http://g.msn.co.kr/0nwkokr0/00/19??"
485: + "PS=10274&NC=10009&CE=42&CP=949&HL="
486: + "���?��";
487: assertNotNull("Encoded chars " + s, UURIFactory.getInstance(s));
488: }
490: /**
491: * Test for java.net.URI parses %20 but getHost null
492: *
493: * See <a href="https://sourceforge.net/tracker/?func=detail&aid=927940&group_id=73833&atid=539099">[ 927940 ] java.net.URI parses %20 but getHost null</a>
494: */
495: public final void testSpaceInHost() {
496: boolean expectedException = false;
497: try {
498: UURIFactory
499: .getInstance("http://www.local-regions.odpm%20.gov.uk"
500: + "/lpsa/challenge/pdf/propect.pdf");
501: } catch (URIException e) {
502: expectedException = true;
503: }
504: assertTrue("Did not fail with escaped space.",
505: expectedException);
507: expectedException = false;
508: try {
509: UURIFactory
510: .getInstance("http://www.local-regions.odpm .gov.uk"
511: + "/lpsa/challenge/pdf/propect.pdf");
512: } catch (URIException e) {
513: expectedException = true;
514: }
515: assertTrue("Did not fail with real space.", expectedException);
516: }
518: /**
519: * Test for java.net.URI chokes on hosts_with_underscores.
520: *
521: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=808270&group_id=73833&atid=539099">[ 808270 ] java.net.URI chokes on hosts_with_underscores</a>
522: * @throws URIException
523: */
524: public final void testHostWithUnderscores() throws URIException {
525: UURI uuri = UURIFactory
526: .getInstance("http://x_underscore_underscore.2u.com.tw/nonexistent_page.html");
527: assertEquals("Failed get of host with underscore",
528: "x_underscore_underscore.2u.com.tw", uuri.getHost());
529: }
531: /**
532: * Two dots for igor.
533: */
534: public final void testTwoDots() {
535: boolean expectedException = false;
536: try {
537: UURIFactory
538: .getInstance("http://x_underscore_underscore..2u.com/nonexistent_page.html");
539: } catch (URIException e) {
540: expectedException = true;
541: }
542: assertTrue("Two dots did not throw exception",
543: expectedException);
544: }
546: /**
547: * Test for java.net.URI#getHost fails when leading digit.
548: *
549: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=910120&group_id=73833&atid=539099">[ 910120 ] java.net.URI#getHost fails when leading digit.</a>
550: * @throws URIException
551: */
552: public final void testHostWithDigit() throws URIException {
553: UURI uuri = UURIFactory
554: .getInstance("http://0204chat.2u.com.tw/nonexistent_page.html");
555: assertEquals("Failed get of host with digit",
556: "0204chat.2u.com.tw", uuri.getHost());
557: }
559: /**
560: * Test for Constraining java URI class.
561: *
562: * @see <a href="https://sourceforge.net/tracker/?func=detail&aid=949548&group_id=73833&atid=539099">[ 949548 ] Constraining java URI class</a>
563: */
564: public final void testPort() {
565: checkBadPort("http://www.tyopaikat.com:a/robots.txt");
566: checkBadPort("");
567: checkBadPort("http://pdb.rutgers.edu:81.rutgers.edu/robots.txt");
568: checkBadPort("https://webmail.gse.harvard.edu:9100robots.txt/robots.txt");
569: checkBadPort("https://webmail.gse.harvard.edu:0/robots.txt/robots.txt");
570: }
572: /**
573: * Test bad port throws exception.
574: * @param uri URI with bad port to check.
575: */
576: private void checkBadPort(String uri) {
577: boolean exception = false;
578: try {
579: UURIFactory.getInstance(uri);
580: } catch (URIException e) {
581: exception = true;
582: }
583: assertTrue("Didn't throw exception: " + uri, exception);
584: }
586: /**
587: * Preserve userinfo capitalization.
588: * @throws URIException
589: */
590: public final void testUserinfo() throws URIException {
591: final String authority = "stack:StAcK@www.tyopaikat.com";
592: final String uri = "http://" + authority + "/robots.txt";
593: UURI uuri = UURIFactory.getInstance(uri);
594: assertEquals("Authority not equal", uuri.getAuthority(),
595: authority);
596: /*
597: String tmp = uuri.toString();
598: assertTrue("URI not equal", tmp.equals(uri));
599: */
600: }
602: /**
603: * Test user info + port
604: * @throws URIException
605: */
606: public final void testUserinfoPlusPort() throws URIException {
607: final String userInfo = "stack:StAcK";
608: final String authority = "www.tyopaikat.com";
609: final int port = 8080;
610: final String uri = "http://" + userInfo + "@" + authority + ":"
611: + port + "/robots.txt";
612: UURI uuri = UURIFactory.getInstance(uri);
613: assertEquals("Host not equal", authority, uuri.getHost());
614: assertEquals("Userinfo Not equal", userInfo, uuri.getUserinfo());
615: assertEquals("Port not equal", port, uuri.getPort());
616: assertEquals("Authority wrong",
617: "stack:StAcK@www.tyopaikat.com:8080", uuri
618: .getAuthority());
619: assertEquals("AuthorityMinusUserinfo wrong",
620: "www.tyopaikat.com:8080", uuri
621: .getAuthorityMinusUserinfo());
623: }
625: /**
626: * Tests from rfc2396 with amendments to accomodate differences
627: * intentionally added to make our URI handling like IEs.
628: *
629: * <pre>
630: * g:h = g:h
631: * g = http://a/b/c/g
632: * ./g = http://a/b/c/g
633: * g/ = http://a/b/c/g/
634: * /g = http://a/g
635: * //g = http://g
636: * ?y = http://a/b/c/?y
637: * g?y = http://a/b/c/g?y
638: * #s = (current document)#s
639: * g#s = http://a/b/c/g#s
640: * g?y#s = http://a/b/c/g?y#s
641: * ;x = http://a/b/c/;x
642: * g;x = http://a/b/c/g;x
643: * g;x?y#s = http://a/b/c/g;x?y#s
644: * . = http://a/b/c/
645: * ./ = http://a/b/c/
646: * .. = http://a/b/
647: * ../ = http://a/b/
648: * ../g = http://a/b/g
649: * ../.. = http://a/
650: * ../../ = http://a/
651: * ../../g = http://a/g
652: * </pre>
653: *
654: * @throws URIException
655: */
656: public final void testRFC2396Relative() throws URIException {
657: UURI base = UURIFactory.getInstance("http://a/b/c/d;p?q");
658: TreeMap<String, String> m = new TreeMap<String, String>();
659: m.put("..", "http://a/b/");
660: m.put("../", "http://a/b/");
661: m.put("../g", "http://a/b/g");
662: m.put("../..", "http://a/");
663: m.put("../../", "http://a/");
664: m.put("../../g", "http://a/g");
665: m.put("g#s", "http://a/b/c/g#s");
666: m.put("g?y#s ", "http://a/b/c/g?y#s");
667: m.put(";x", "http://a/b/c/;x");
668: m.put("g;x", "http://a/b/c/g;x");
669: m.put("g;x?y#s", "http://a/b/c/g;x?y#s");
670: m.put(".", "http://a/b/c/");
671: m.put("./", "http://a/b/c/");
672: m.put("g", "http://a/b/c/g");
673: m.put("./g", "http://a/b/c/g");
674: m.put("g/", "http://a/b/c/g/");
675: m.put("/g", "http://a/g");
676: m.put("//g", "http://g");
677: m.put("?y", "http://a/b/c/?y");
678: m.put("g?y", "http://a/b/c/g?y");
679: // EXTRAS beyond the RFC set.
680: // TODO: That these resolve to a path of /a/g might be wrong. Perhaps
681: // it should be '/g'?.
682: m.put("/../../../../../../../../g", "http://a/g");
683: m.put("../../../../../../../../g", "http://a/g");
684: m.put("../G", "http://a/b/G");
685: for (Iterator i = m.keySet().iterator(); i.hasNext();) {
686: String key = (String) i.next();
687: String value = (String) m.get(key);
688: UURI uuri = UURIFactory.getInstance(base, key);
689: assertTrue("Unexpected " + key + " " + value + " " + uuri,
690: uuri.equals(UURIFactory.getInstance(value)));
691: }
692: }
694: /**
695: * A UURI should always be without a 'fragment' segment, which is
696: * unused and irrelevant for network fetches.
697: *
698: * See [ 970666 ] #anchor links not trimmed, and thus recrawled
699: *
700: * @throws URIException
701: */
702: public final void testAnchors() throws URIException {
703: UURI uuri = UURIFactory
704: .getInstance("http://www.example.com/path?query#anchor");
705: assertEquals("Not equal", "http://www.example.com/path?query",
706: uuri.toString());
707: }
709: /**
710: * Ensure that URI strings beginning with a colon are treated
711: * the same as browsers do (as relative, rather than as absolute
712: * with zero-length scheme).
713: *
714: * @throws URIException
715: */
716: public void testStartsWithColon() throws URIException {
717: UURI base = UURIFactory
718: .getInstance("http://www.example.com/path/page");
719: UURI uuri = UURIFactory.getInstance(base, ":foo");
720: assertEquals("derelativize starsWithColon", uuri.getURI(),
721: "http://www.example.com/path/:foo");
722: }
724: /**
725: * Ensure that stray trailing '%' characters do not prevent
726: * UURI instances from being created, and are reasonably
727: * escaped when encountered.
728: *
729: * @throws URIException
730: */
731: public void testTrailingPercents() throws URIException {
732: String plainPath = "http://www.example.com/path%";
733: UURI plainPathUuri = UURIFactory.getInstance(plainPath);
734: assertEquals("plainPath getURI", plainPath, plainPathUuri
735: .getURI());
736: assertEquals("plainPath getEscapedURI",
737: "http://www.example.com/path%", // browsers don't escape '%'
738: plainPathUuri.getEscapedURI());
740: String partiallyEscapedPath = "http://www.example.com/pa%20th%";
741: UURI partiallyEscapedPathUuri = UURIFactory
742: .getInstance(partiallyEscapedPath);
743: // assertEquals("partiallyEscapedPath getURI",
744: // "http://www.example.com/pa th%", // TODO: is this desirable?
745: //// partiallyEscapedPath,
746: // partiallyEscapedPathUuri.getURI());
747: assertEquals("partiallyEscapedPath getEscapedURI",
748: "http://www.example.com/pa%20th%",
749: partiallyEscapedPathUuri.getEscapedURI());
751: String plainQueryString = "http://www.example.com/path?q=foo%";
752: UURI plainQueryStringUuri = UURIFactory
753: .getInstance(plainQueryString);
754: // assertEquals("plainQueryString getURI",
755: // plainQueryString,
756: // plainQueryStringUuri.getURI());
757: assertEquals("plainQueryString getEscapedURI",
758: "http://www.example.com/path?q=foo%",
759: plainQueryStringUuri.getEscapedURI());
761: String partiallyEscapedQueryString = "http://www.example.com/pa%20th?q=foo%";
762: UURI partiallyEscapedQueryStringUuri = UURIFactory
763: .getInstance(partiallyEscapedQueryString);
764: assertEquals("partiallyEscapedQueryString getURI",
765: "http://www.example.com/pa th?q=foo%",
766: partiallyEscapedQueryStringUuri.getURI());
767: assertEquals("partiallyEscapedQueryString getEscapedURI",
768: "http://www.example.com/pa%20th?q=foo%",
769: partiallyEscapedQueryStringUuri.getEscapedURI());
770: }
772: /**
773: * Ensure that stray '%' characters do not prevent
774: * UURI instances from being created, and are reasonably
775: * escaped when encountered.
776: *
777: * @throws URIException
778: */
779: public void testStrayPercents() throws URIException {
780: String oneStray = "http://www.example.com/pa%th";
781: UURI oneStrayUuri = UURIFactory.getInstance(oneStray);
782: assertEquals("oneStray getURI", oneStray, oneStrayUuri.getURI());
783: assertEquals("oneStray getEscapedURI",
784: "http://www.example.com/pa%th", // browsers don't escape '%'
785: oneStrayUuri.getEscapedURI());
787: String precededByValidEscape = "http://www.example.com/pa%20th%way";
788: UURI precededByValidEscapeUuri = UURIFactory
789: .getInstance(precededByValidEscape);
790: assertEquals("precededByValidEscape getURI",
791: "http://www.example.com/pa th%way", // getURI interprets escapes
792: precededByValidEscapeUuri.getURI());
793: assertEquals("precededByValidEscape getEscapedURI",
794: "http://www.example.com/pa%20th%way",
795: precededByValidEscapeUuri.getEscapedURI());
797: String followedByValidEscape = "http://www.example.com/pa%th%20way";
798: UURI followedByValidEscapeUuri = UURIFactory
799: .getInstance(followedByValidEscape);
800: assertEquals("followedByValidEscape getURI",
801: "http://www.example.com/pa%th way", // getURI interprets escapes
802: followedByValidEscapeUuri.getURI());
803: assertEquals("followedByValidEscape getEscapedURI",
804: "http://www.example.com/pa%th%20way",
805: followedByValidEscapeUuri.getEscapedURI());
806: }
808: public void testEscapingNotNecessary() throws URIException {
809: String escapesUnnecessary = "http://www.example.com/misc;reserved:chars@that&don't=need"
810: + "+escaping$even,though!you(might)initially?think#so";
811: // expect everything but the #fragment
812: String expected = escapesUnnecessary.substring(0,
813: escapesUnnecessary.length() - 3);
814: assertEquals("escapes unnecessary", expected, UURIFactory
815: .getInstance(escapesUnnecessary).toString());
816: }
818: public void testIdn() throws URIException {
819: // See http://www.josefsson.org/idn.php.
820: String idn1 = new String("http://räksmörgås.josefßon.org/");
821: String puny1 = "http://xn--rksmrgs-5wao1o.josefsson.org/";
822: assertEquals("encoding of " + idn1, puny1, UURIFactory
823: .getInstance(idn1).toString());
824: String idn2 = "http://www.pølse.dk/";
825: String puny2 = "http://www.xn--plse-gra.dk/";
826: assertEquals("encoding of " + idn2, puny2, UURIFactory
827: .getInstance(idn2).toString());
828: }
830: public void testNewLineInURL() throws URIException {
831: UURI uuri = UURIFactory.getInstance("http://www.ar\rchive\n."
832: + "org/i\n\n\r\rndex.html");
833: assertEquals("http://www.archive.org/index.html", uuri
834: .toString());
835: }
837: public void testTabsInURL() throws URIException {
838: UURI uuri = UURIFactory.getInstance("http://www.ar\tchive\t."
839: + "org/i\t\r\n\tndex.html");
840: assertEquals("http://www.archive.org/index.html", uuri
841: .toString());
842: }
844: public void testQueryEscaping() throws URIException {
845: UURI uuri = UURIFactory
846: .getInstance("http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'\";:/?.>,<");
847: assertEquals(
848: // tests in FF1.5 indicate it only escapes " < >
849: "http://www.yahoo.com/foo?somechars!@$%^&*()_-+={[}]|\'%22;:/?.%3E,%3C",
850: uuri.toString());
851: }
853: /**
854: * Check that our 'normalization' does same as Nutch's
855: * Below before-and-afters were taken from the nutch urlnormalizer-basic
856: * TestBasicURLNormalizer class (December 2006, Nutch 0.9-dev).
857: * @throws URIException
858: */
859: public void testSameAsNutchURLFilterBasic() throws URIException {
860: assertEquals(UURIFactory.getInstance(" http://foo.com/ ")
861: .toString(), "http://foo.com/");
863: // check that protocol is lower cased
864: assertEquals(UURIFactory.getInstance("HTTP://foo.com/")
865: .toString(), "http://foo.com/");
867: // check that host is lower cased
868: assertEquals(UURIFactory.getInstance(
869: "http://Foo.Com/index.html").toString(),
870: "http://foo.com/index.html");
871: assertEquals(UURIFactory.getInstance(
872: "http://Foo.Com/index.html").toString(),
873: "http://foo.com/index.html");
875: // check that port number is normalized
876: assertEquals(UURIFactory.getInstance(
877: "http://foo.com:80/index.html").toString(),
878: "http://foo.com/index.html");
879: assertEquals(UURIFactory.getInstance("http://foo.com:81/")
880: .toString(), "http://foo.com:81/");
882: // check that null path is normalized
883: assertEquals(UURIFactory.getInstance("http://foo.com")
884: .toString(), "http://foo.com/");
886: // check that references are removed
887: assertEquals(UURIFactory.getInstance(
888: "http://foo.com/foo.html#ref").toString(),
889: "http://foo.com/foo.html");
891: // // check that encoding is normalized
892: // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
894: // check that unnecessary "../" are removed
895: assertEquals(UURIFactory.getInstance("http://foo.com/aa/../")
896: .toString(), "http://foo.com/");
897: assertEquals(UURIFactory
898: .getInstance("http://foo.com/aa/bb/../").toString(),
899: "http://foo.com/aa/");
901: /* We fail this one. Here we produce: 'http://foo.com/'.
902: assertEquals(UURIFactory.
903: getInstance("http://foo.com/aa/..").toString(),
904: "http://foo.com/aa/..");
905: */
907: assertEquals(UURIFactory.getInstance(
908: "http://foo.com/aa/bb/cc/../../foo.html").toString(),
909: "http://foo.com/aa/foo.html");
910: assertEquals(UURIFactory.getInstance(
911: "http://foo.com/aa/bb/../cc/dd/../ee/foo.html")
912: .toString(), "http://foo.com/aa/cc/ee/foo.html");
913: assertEquals(UURIFactory.getInstance(
914: "http://foo.com/../foo.html").toString(),
915: "http://foo.com/foo.html");
916: assertEquals(UURIFactory.getInstance(
917: "http://foo.com/../../foo.html").toString(),
918: "http://foo.com/foo.html");
919: assertEquals(UURIFactory.getInstance(
920: "http://foo.com/../aa/../foo.html").toString(),
921: "http://foo.com/foo.html");
922: assertEquals(UURIFactory.getInstance(
923: "http://foo.com/aa/../../foo.html").toString(),
924: "http://foo.com/foo.html");
925: assertEquals(UURIFactory.getInstance(
926: "http://foo.com/aa/../bb/../foo.html/../../")
927: .toString(), "http://foo.com/");
928: assertEquals(UURIFactory.getInstance(
929: "http://foo.com/../aa/foo.html").toString(),
930: "http://foo.com/aa/foo.html");
931: assertEquals(UURIFactory.getInstance(
932: "http://foo.com/../aa/../foo.html").toString(),
933: "http://foo.com/foo.html");
934: assertEquals(UURIFactory.getInstance(
935: "http://foo.com/a..a/foo.html").toString(),
936: "http://foo.com/a..a/foo.html");
937: assertEquals(UURIFactory.getInstance(
938: "http://foo.com/a..a/../foo.html").toString(),
939: "http://foo.com/foo.html");
940: assertEquals(UURIFactory.getInstance(
941: "http://foo.com/foo.foo/../foo.html").toString(),
942: "http://foo.com/foo.html");
943: }
945: public void testHttpSchemeColonSlash() {
946: boolean exception = false;
947: try {
948: UURIFactory.getInstance("https:/");
949: } catch (URIException e) {
950: exception = true;
951: }
952: assertTrue("Didn't throw exception when one expected",
953: exception);
954: exception = false;
955: try {
956: UURIFactory.getInstance("http://");
957: } catch (URIException e) {
958: exception = true;
959: }
960: assertTrue("Didn't throw exception when one expected",
961: exception);
962: }
963: }