0001: // yacyURL.java
0002: // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
0003: // first published 13.07.2006 on http://yacy.net
0004: //
0005: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
0006: // $LastChangedRevision: 1986 $
0007: // $LastChangedBy: orbiter $
0008: //
0009: // LICENSE
0010: //
0011: // This program is free software; you can redistribute it and/or modify
0012: // it under the terms of the GNU General Public License as published by
0013: // the Free Software Foundation; either version 2 of the License, or
0014: // (at your option) any later version.
0015: //
0016: // This program is distributed in the hope that it will be useful,
0017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0019: // GNU General Public License for more details.
0020: //
0021: // You should have received a copy of the GNU General Public License
0022: // along with this program; if not, write to the Free Software
0023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0024:
0025: package de.anomic.yacy;
0026:
0027: // this class exists to provide a system-wide normal form representation of urls,
0028: // and to avoid the DNS queries that are triggered when java.net.URL is used.
0029:
0030: import java.io.File;
0031: import java.net.MalformedURLException;
0032: import java.util.HashMap;
0033: import java.util.Iterator;
0034: import java.util.TreeSet;
0035: import java.util.regex.Matcher;
0036: import java.util.regex.Pattern;
0037:
0038: import de.anomic.kelondro.kelondroBase64Order;
0039: import de.anomic.server.serverCodings;
0040: import de.anomic.server.serverDomains;
0041:
0042: public class yacyURL {
0043:
0044: // TLD separation in political and cultural parts
0045: // https://www.cia.gov/cia/publications/factbook/index.html
0046: // http://en.wikipedia.org/wiki/List_of_countries_by_continent
0047:
0048: private static final String[] TLD_NorthAmericaOceania = {
0049: // primary english-speaking countries
0050: // english-speaking countries from central america are also included
0051: // includes also dutch and french colonies in the caribbean sea
0052: // and US/English/Australian military bases in asia
0053: "EDU=US Educational",
0054: "GOV=US Government",
0055: "MIL=US Military",
0056: "NET=Network",
0057: "ORG=Non-Profit Organization",
0058: "AN=Netherlands Antilles",
0059: "AS=American Samoa",
0060: "AG=Antigua and Barbuda",
0061: "AI=Anguilla",
0062: "AU=Australia",
0063: "BB=Barbados",
0064: "BZ=Belize",
0065: "BM=Bermuda",
0066: "BS=Bahamas",
0067: "CA=Canada",
0068: "CC=Cocos (Keeling) Islands",
0069: "CK=Cook Islands",
0070: "CX=Christmas Island", // located in the Indian Ocean, but belongs to Australia
0071: "DM=Dominica",
0072: "FM=Micronesia",
0073: "FJ=Fiji",
0074: "GD=Grenada",
0075: "GP=Guadeloupe",
0076: "GS=South Georgia and the South Sandwich Islands", // south of south america, but administrated by british, has only a scientific base
0077: "GU=Guam", // strategical US basis close to Japan
0078: "HM=Heard and McDonald Islands", // uninhabited, sub-Antarctic island, owned by Australia
0079: "HT=Haiti",
0080: "IO=British Indian Ocean Territory", // UK-US naval support facility in the Indian Ocean
0081: "KI=Kiribati", // 33 coral atolls in the pacific, formerly owned by UK
0082: "KN=Saint Kitts and Nevis", // islands in the carribean see
0083: "KY=Cayman Islands",
0084: "LC=Saint Lucia",
0085: "MH=Marshall Islands", // formerly US atomic bomb test site, now a key installation in the US missile defense network
0086: "MP=Northern Mariana Islands", // US strategic location in the western Pacific Ocean
0087: "NC=New Caledonia",
0088: "NF=Norfolk Island",
0089: "NR=Nauru", // independent UN island
0090: "NU=Niue", // one of world's largest coral islands
0091: "NZ=New Zealand (Aotearoa)",
0092: "PG=Papua New Guinea",
0093: "PN=Pitcairn", // overseas territory of the UK
0094: "PR=Puerto Rico", // territory of the US with commonwealth status
0095: "PW=Palau", // was once governed by Micronesia
0096: "Sb=Solomon Islands",
0097: "TC=Turks and Caicos Islands", // overseas territory of the UK
0098: "TK=Tokelau", // group of three atolls in the South Pacific Ocean, british protectorat
0099: "TO=Tonga",
0100: "TT=Trinidad and Tobago",
0101: "TV=Tuvalu", // nine coral atolls in the South Pacific Ocean; in 2000, Tuvalu leased its TLD ".tv" for $50 million over a 12-year period
0102: "UM=US Minor Outlying Islands", // nine insular United States possessions in the Pacific Ocean and the Caribbean Sea
0103: "US=United States", "VC=Saint Vincent and the Grenadines",
0104: "VG=Virgin Islands (British)", "VI=Virgin Islands (U.S.)",
0105: "VU=Vanuatu", "WF=Wallis and Futuna Islands", "WS=Samoa" };
0106: private static final String[] TLD_MiddleSouthAmerica = {
0107: // primary spanish and portugese-speaking
0108: "AR=Argentina", "AW=Aruba", "BR=Brazil", "BO=Bolivia",
0109: "CL=Chile", "CO=Colombia", "CR=Costa Rica", "CU=Cuba",
0110: "DO=Dominican Republic", "EC=Ecuador",
0111: "FK=Falkland Islands (Malvinas)", "GF=French Guiana",
0112: "GT=Guatemala", "GY=Guyana", "HN=Honduras", "JM=Jamaica",
0113: "MX=Mexico", "NI=Nicaragua", "PA=Panama", "PE=Peru",
0114: "PY=Paraguay", "SR=Suriname", "SV=El Salvador",
0115: "UY=Uruguay", "VE=Venezuela" };
0116: private static final String[] TLD_EuropaRussia = {
0117: // includes also countries that are mainly french- dutch- speaking
0118: // and culturally close to europe
0119: "AD=Andorra",
0120: "AL=Albania",
0121: "AQ=Antarctica",
0122: "AT=Austria",
0123: "BA=Bosnia and Herzegovina",
0124: "BE=Belgium",
0125: "BG=Bulgaria",
0126: "BV=Bouvet Island", // this island is uninhabited and covered by ice, south of africa but governed by Norway
0127: "BY=Belarus",
0128: "CH=Switzerland",
0129: "CS=Czechoslovakia (former)",
0130: "CZ=Czech Republic",
0131: "CY=Cyprus",
0132: "DE=Germany",
0133: "DK=Denmark",
0134: "ES=Spain",
0135: "EE=Estonia",
0136: "FI=Finland",
0137: "FO=Faroe Islands", // Viking Settlers
0138: "FR=France", "FX=France, Metropolitan",
0139: "GB=Great Britain (UK)", "GI=Gibraltar", "GL=Greenland",
0140: "GR=Greece", "HR=Croatia (Hrvatska)", "HU=Hungary",
0141: "IE=Ireland", "IS=Iceland",
0142: "IT=Italy",
0143: "LI=Liechtenstein",
0144: "LT=Lithuania",
0145: "LU=Luxembourg",
0146: "LV=Latvia",
0147: "MD=Moldova",
0148: "MC=Monaco",
0149: "MK=Macedonia",
0150: "MN=Mongolia",
0151: "MS=Montserrat", // British island in the Caribbean Sea, almost not populated because of strong vulcanic activity
0152: "MT=Malta",
0153: "MQ=Martinique", // island in the eastern Caribbean Sea, overseas department of France
0154: "NATO=Nato field",
0155: "NL=Netherlands",
0156: "NO=Norway",
0157: "PF=French Polynesia", // French annexed Polynesian island in the South Pacific, French atomic bomb test site
0158: "PL=Poland",
0159: "PM=St. Pierre and Miquelon", // french-administrated colony close to canada, belongs to France
0160: "PT=Portugal", "RO=Romania", "RU=Russia",
0161: "SE=Sweden",
0162: "SI=Slovenia",
0163: "SJ=Svalbard and Jan Mayen Islands", // part of Norway
0164: "SM=San Marino", "SK=Slovak Republic",
0165: "SU=USSR (former)",
0166: "TF=French Southern Territories", // islands in the arctic see, no inhabitants
0167: "UK=United Kingdom", "UA=Ukraine",
0168: "VA=Vatican City State (Holy See)", "YU=Yugoslavia" };
0169:
0170: private static final String[] TLD_MiddleEastWestAsia = {
0171: // states that are influenced by islamic culture and arabic language
0172: // includes also eurasia states and those that had been part of the former USSR and close to southwest asia
0173: "AE=United Arab Emirates", "AF=Afghanistan", "AM=Armenia",
0174: "AZ=Azerbaijan", "BH=Bahrain", "GE=Georgia", "IL=Israel",
0175: "IQ=Iraq", "IR=Iran", "JO=Jordan", "KG=Kyrgyzstan",
0176: "KZ=Kazakhstan", "KW=Kuwait", "LB=Lebanon", "OM=Oman",
0177: "QA=Qatar", "SA=Saudi Arabia", "SY=Syria", "TJ=Tajikistan",
0178: "TM=Turkmenistan", "PK=Pakistan", "TR=Turkey",
0179: "UZ=Uzbekistan", "YE=Yemen" };
0180: private static final String[] TLD_SouthEastAsia = {
0181: "BD=Bangladesh", "BN=Brunei Darussalam", "BT=Bhutan",
0182: "CN=China", "HK=Hong Kong", "ID=Indonesia", "IN=India",
0183: "LA=Laos", "NP=Nepal", "JP=Japan", "KH=Cambodia",
0184: "KP=Korea (North)", "KR=Korea (South)", "LK=Sri Lanka",
0185: "MY=Malaysia",
0186: "MM=Myanmar", // formerly known as Burma
0187: "MO=Macau", // Portuguese settlement, part of China, but has some autonomy
0188: "MV=Maldives", // group of atolls in the Indian Ocean
0189: "PH=Philippines", "SG=Singapore", "TP=East Timor",
0190: "TH=Thailand", "TW=Taiwan", "VN=Viet Nam" };
0191: private static final String[] TLD_Africa = { "AO=Angola",
0192: "BF=Burkina Faso", "BI=Burundi", "BJ=Benin", "BW=Botswana",
0193: "CF=Central African Republic", "CG=Congo",
0194: "CI=Cote D'Ivoire (Ivory Coast)", "CM=Cameroon",
0195: "CV=Cape Verde", "DJ=Djibouti", "DZ=Algeria", "EG=Egypt",
0196: "EH=Western Sahara", "ER=Eritrea", "ET=Ethiopia",
0197: "GA=Gabon", "GH=Ghana", "GM=Gambia", "GN=Guinea",
0198: "GQ=Equatorial Guinea", "GW=Guinea-Bissau", "KE=Kenya",
0199: "KM=Comoros", "LR=Liberia", "LS=Lesotho", "LY=Libya",
0200: "MA=Morocco", "MG=Madagascar", "ML=Mali", "MR=Mauritania",
0201: "MU=Mauritius", "MW=Malawi", "MZ=Mozambique", "NA=Namibia",
0202: "NE=Niger", "NG=Nigeria", "RE=Reunion", "RW=Rwanda",
0203: "SC=Seychelles", "SD=Sudan", "SH=St. Helena",
0204: "SL=Sierra Leone", "SN=Senegal", "SO=Somalia",
0205: "ST=Sao Tome and Principe", "SZ=Swaziland", "TD=Chad",
0206: "TG=Togo", "TN=Tunisia", "TZ=Tanzania", "UG=Uganda",
0207: "ZA=South Africa", "ZM=Zambia", "ZR=Zaire", "ZW=Zimbabwe",
0208: "YT=Mayotte" };
0209: private static final String[] TLD_Generic = { "COM=US Commercial",
0210: "AERO=", "BIZ=", "COOP=", "INFO=", "MUSEUM=", "NAME=",
0211: "PRO=", "ARPA=", "INT=International", "ARPA=Arpanet",
0212: "NT=Neutral Zone" };
0213:
0214: /*
0215: * TLDs: aero, biz, com, coop, edu, gov, info, int, mil, museum, name, net,
0216: * org, pro, arpa AC, AD, AE, AERO, AF, AG, AI, AL, AM, AN, AO, AQ, AR,
0217: * ARPA, AS, AT, AU, AW, AZ, BA, BB, BD, BE, BF, BG, BH, BI, BIZ, BJ, BM,
0218: * BN, BO, BR, BS, BT, BV, BW, BY, BZ, CA, CC, CD, CF, CG, CH, CI, CK, CL,
0219: * CM, CN, CO, COM, COOP, CR, CU, CV, CX, CY, CZ, DE, DJ, DK, DM, DO, DZ,
0220: * EC, EDU, EE, EG, ER, ES, ET, EU, FI, FJ, FK, FM, FO, FR, GA, GB, GD, GE,
0221: * GF, GG, GH, GI, GL, GM, GN, GOV, GP, GQ, GR, GS, GT, GU, GW, GY, HK, HM,
0222: * HN, HR, HT, HU, ID, IE, IL, IM, IN, INFO, INT, IO, IQ, IR, IS, IT, JE,
0223: * JM, JO, JOBS, JP, KE, KG, KH, KI, KM, KN, KR, KW, KY, KZ, LA, LB, LC, LI,
0224: * LK, LR, LS, LT, LU, LV, LY, MA, MC, MD, MG, MH, MIL, MK, ML, MM, MN, MO,
0225: * MOBI, MP, MQ, MR, MS, MT, MU, MUSEUM, MV, MW, MX, MY, MZ, NA, NAME, NC,
0226: * NE, NET, NF, NG, NI, NL, NO, NP, NR, NU, NZ, OM, ORG, PA, PE, PF, PG, PH,
0227: * PK, PL, PM, PN, PR, PRO, PS, PT, PW, PY, QA, RE, RO, RU, RW, SA, SB, SC,
0228: * SD, SE, SG, SH, SI, SJ, SK, SL, SM, SN, SO, SR, ST, SU, SV, SY, SZ, TC,
0229: * TD, TF, TG, TH, TJ, TK, TL, TM, TN, TO, TP, TR, TRAVEL, TT, TV, TW, TZ,
0230: * UA, UG, UK, UM, US, UY, UZ, VA, VC, VE, VG, VI, VN, VU, WF, WS, YE, YT,
0231: * YU, ZA, ZM, ZW
0232: */
0233:
// placeholder hash consisting only of '-' characters; it is built in the
// static initializer with a length of yacySeedDB.commonHashLength
0234: public static String dummyHash;
0235:
// lower-case TLD -> id of the region table the TLD was registered from (see insertTLDProps)
0236: private static HashMap<String, Integer> TLDID = new HashMap<String, Integer>();
// lower-case TLD -> description text that follows the '=' in the table entry
0237: private static HashMap<String, String> TLDName = new HashMap<String, String>();
0238:
0239: private static void insertTLDProps(String[] TLDList, int id) {
0240: int p;
0241: String tld, name;
0242: Integer ID = new Integer(id);
0243: for (int i = 0; i < TLDList.length; i++) {
0244: p = TLDList[i].indexOf('=');
0245: if (p > 0) {
0246: tld = TLDList[i].substring(0, p).toLowerCase();
0247: name = TLDList[i].substring(p + 1);
0248: TLDID.put(tld, ID);
0249: TLDName.put(tld, name);
0250: }
0251: }
0252: }
0253:
0254: static {
0255: // create a dummy hash
0256: dummyHash = "";
0257: for (int i = 0; i < yacySeedDB.commonHashLength; i++)
0258: dummyHash += "-";
0259:
0260: // assign TLD-ids and names
0261: insertTLDProps(TLD_EuropaRussia, 0);
0262: insertTLDProps(TLD_MiddleSouthAmerica, 1);
0263: insertTLDProps(TLD_SouthEastAsia, 2);
0264: insertTLDProps(TLD_MiddleEastWestAsia, 3);
0265: insertTLDProps(TLD_NorthAmericaOceania, 4);
0266: insertTLDProps(TLD_Africa, 5);
0267: insertTLDProps(TLD_Generic, 6);
0268: // the id=7 is used to flag local addresses
0269: }
0270:
0271: // class variables
// the parts of the url; 'quest' is the query part (behind '?') and 'ref' the
// reference (behind '#'); 'hash' is the url hash and may be null until hash()
// computes it
0272: private String protocol, host, userInfo, path, quest, ref, hash;
// port number; negative if no port is known
0273: private int port;
0274:
/**
 * Creates a url by parsing its string form.
 *
 * @param url  the url string; must not be null
 * @param hash the already known url hash, or null if it is not known
 * @throws MalformedURLException if url is null or cannot be parsed
 */
public yacyURL(String url, String hash) throws MalformedURLException {
    if (url != null) {
        parseURLString(url);
        this.hash = hash;
        return;
    }
    throw new MalformedURLException("url string is null");
}
0282:
0283: private void parseURLString(String url)
0284: throws MalformedURLException {
0285: // identify protocol
0286: assert (url != null);
0287: url = url.trim();
0288: int p = url.indexOf(':');
0289: if (p < 0) {
0290: if (url.startsWith("www.")) {
0291: url = "http://" + url;
0292: p = 4;
0293: } else {
0294: throw new MalformedURLException(
0295: "protocol is not given in '" + url + "'");
0296: }
0297: }
0298: this .protocol = url.substring(0, p).toLowerCase().trim();
0299: if (url.length() < p + 4)
0300: throw new MalformedURLException("URL not parseable: '"
0301: + url + "'");
0302: if (url.substring(p + 1, p + 3).equals("//")) {
0303: // identify host, userInfo and file for http and ftp protocol
0304: int q = url.indexOf('/', p + 3);
0305: int r;
0306: if (q < 0) {
0307: if ((r = url.indexOf('@', p + 3)) < 0) {
0308: host = url.substring(p + 3);
0309: userInfo = null;
0310: } else {
0311: host = url.substring(r + 1);
0312: userInfo = url.substring(p + 3, r);
0313: }
0314: path = "/";
0315: } else {
0316: host = url.substring(p + 3, q);
0317: if ((r = host.indexOf('@')) < 0) {
0318: userInfo = null;
0319: } else {
0320: userInfo = host.substring(0, r);
0321: host = host.substring(r + 1);
0322: }
0323: path = url.substring(q);
0324: }
0325:
0326: path = resolveBackpath(path);
0327: identPort(url, (protocol.equals("http") ? 80 : ((protocol
0328: .equals("https")) ? 443
0329: : ((protocol.equals("ftp")) ? 21 : -1))));
0330: identRef();
0331: identQuest();
0332: escape();
0333: } else {
0334: // this is not a http or ftp url
0335: if (protocol.equals("mailto")) {
0336: // parse email url
0337: int q = url.indexOf('@', p + 3);
0338: if (q < 0) {
0339: throw new MalformedURLException(
0340: "wrong email address: " + url);
0341: } else {
0342: userInfo = url.substring(p + 1, q);
0343: host = url.substring(q + 1);
0344: path = null;
0345: port = -1;
0346: quest = null;
0347: ref = null;
0348: }
0349: } else {
0350: throw new MalformedURLException("unknown protocol: "
0351: + url);
0352: }
0353: }
0354: }
0355:
/**
 * Creates a url with the protocol "file", an empty host and the port -1
 * whose path is the absolute path of the given file.
 *
 * @param file the file to address
 * @throws MalformedURLException declared by the constructor this one delegates to
 */
0356: public yacyURL(File file) throws MalformedURLException {
0357: this ("file", "", -1, file.getAbsolutePath());
0358: }
0359:
0360: public static yacyURL newURL(String baseURL, String relPath)
0361: throws MalformedURLException {
0362: if ((baseURL == null) || (relPath.startsWith("http://"))
0363: || (relPath.startsWith("https://"))
0364: || (relPath.startsWith("ftp://"))
0365: || (relPath.startsWith("file://"))
0366: || (relPath.startsWith("smb://"))) {
0367: return new yacyURL(relPath, null);
0368: } else {
0369: return new yacyURL(new yacyURL(baseURL, null), relPath);
0370: }
0371: }
0372:
0373: public static yacyURL newURL(yacyURL baseURL, String relPath)
0374: throws MalformedURLException {
0375: if ((baseURL == null) || (relPath.startsWith("http://"))
0376: || (relPath.startsWith("https://"))
0377: || (relPath.startsWith("ftp://"))
0378: || (relPath.startsWith("file://"))
0379: || (relPath.startsWith("smb://"))) {
0380: return new yacyURL(relPath, null);
0381: } else {
0382: return new yacyURL(baseURL, relPath);
0383: }
0384: }
0385:
0386: private yacyURL(yacyURL baseURL, String relPath)
0387: throws MalformedURLException {
0388: if (baseURL == null)
0389: throw new MalformedURLException("base URL is null");
0390: if (relPath == null)
0391: throw new MalformedURLException("relPath is null");
0392:
0393: this .hash = null;
0394: this .protocol = baseURL.protocol;
0395: this .host = baseURL.host;
0396: this .port = baseURL.port;
0397: this .userInfo = baseURL.userInfo;
0398: if (relPath.toLowerCase().startsWith("javascript:")) {
0399: this .path = baseURL.path;
0400: } else if ((relPath.startsWith("http://"))
0401: || (relPath.startsWith("https://"))
0402: || (relPath.startsWith("ftp://"))
0403: || (relPath.startsWith("file://"))
0404: || (relPath.startsWith("smb://"))) {
0405: this .path = baseURL.path;
0406: } else if (relPath.startsWith("/")) {
0407: this .path = relPath;
0408: } else if (baseURL.path.endsWith("/")) {
0409: if (relPath.startsWith("#") || relPath.startsWith("?")) {
0410: throw new MalformedURLException(
0411: "relative path malformed: " + relPath);
0412: } else {
0413: this .path = baseURL.path + relPath;
0414: }
0415: } else {
0416: if (relPath.startsWith("#") || relPath.startsWith("?")) {
0417: this .path = baseURL.path + relPath;
0418: } else {
0419: int q = baseURL.path.lastIndexOf('/');
0420: if (q < 0) {
0421: this .path = relPath;
0422: } else {
0423: this .path = baseURL.path.substring(0, q + 1)
0424: + relPath;
0425: }
0426: }
0427: }
0428: this .quest = baseURL.quest;
0429: this .ref = baseURL.ref;
0430:
0431: path = resolveBackpath(path);
0432: identRef();
0433: identQuest();
0434: escape();
0435: }
0436:
/**
 * Creates a url from its parts. A reference ('#') and a query ('?') that
 * are contained in the path are split off; path, query and reference are
 * escaped afterwards.
 *
 * @param protocol the protocol, must not be null
 * @param host     the host name
 * @param port     the port number
 * @param path     the path, possibly followed by query and reference
 * @throws MalformedURLException if protocol is null
 */
public yacyURL(String protocol, String host, int port, String path) throws MalformedURLException {
    if (protocol == null) {
        throw new MalformedURLException("protocol is null");
    }
    this.hash = null; // not known, computed on demand
    this.protocol = protocol;
    this.host = host;
    this.port = port;
    this.path = path;

    identRef();
    identQuest();
    escape();
}
0450:
0451: // resolve '..'
0452: String resolveBackpath(String path) /* throws MalformedURLException */{
0453: /* original version by [MC]
0454: int p;
0455: while ((p = path.indexOf("/..")) >= 0) {
0456: String head = path.substring(0, p);
0457: int q = head.lastIndexOf('/');
0458: if (q < 0) throw new MalformedURLException("backpath cannot be resolved in path = " + path);
0459: path = head.substring(0, q) + path.substring(p + 3);
0460: }*/
0461:
0462: /* by [MT] */
0463: if (path.length() == 0 || path.charAt(0) != '/') {
0464: path = "/" + path;
0465: }
0466:
0467: Pattern pathPattern = Pattern
0468: .compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
0469: Matcher matcher = pathPattern.matcher(path);
0470: while (matcher.find()) {
0471: path = matcher.replaceAll("");
0472: matcher.reset(path);
0473: }
0474:
0475: return path.equals("") ? "/" : path;
0476: }
0477:
0478: /**
0479: * Escapes the following parts of the url, this object already contains:
0480: * <ul>
0481: * <li>path: see {@link #escape(String)}</li>
0482: * <li>ref: same as above</li>
0483: * <li>quest: same as above without the ampersand ("&") and the equals symbol</li>
0484: * </ul>
0485: */
0486: private void escape() {
0487: if (path != null && path.indexOf('%') == -1)
0488: escapePath();
0489: if (quest != null && quest.indexOf('%') == -1)
0490: escapeQuest();
0491: if (ref != null && ref.indexOf('%') == -1)
0492: escapeRef();
0493: }
0494:
0495: private void escapePath() {
0496: String[] pathp = path.split("/", -1);
0497: String ptmp = "";
0498: for (int i = 0; i < pathp.length; i++) {
0499: ptmp += "/" + escape(pathp[i]);
0500: }
0501: path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
0502: }
0503:
// replaces the reference by its escaped form, see escape(String)
0504: private void escapeRef() {
0505: ref = escape(ref);
0506: }
0507:
0508: private void escapeQuest() {
0509: String[] questp = quest.split("&", -1);
0510: String qtmp = "";
0511: for (int i = 0; i < questp.length; i++) {
0512: if (questp[i].indexOf('=') != -1) {
0513: qtmp += "&"
0514: + escape(questp[i].substring(0, questp[i]
0515: .indexOf('=')));
0516: qtmp += "="
0517: + escape(questp[i].substring(questp[i]
0518: .indexOf('=') + 1));
0519: } else {
0520: qtmp += "&" + escape(questp[i]);
0521: }
0522: }
0523: quest = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
0524: }
0525:
0526: private final static String[] hex = { "%00", "%01", "%02", "%03",
0527: "%04", "%05", "%06", "%07", "%08", "%09", "%0A", "%0B",
0528: "%0C", "%0D", "%0E", "%0F", "%10", "%11", "%12", "%13",
0529: "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B",
0530: "%1C", "%1D", "%1E", "%1F", "%20", "%21", "%22", "%23",
0531: "%24", "%25", "%26", "%27", "%28", "%29", "%2A", "%2B",
0532: "%2C", "%2D", "%2E", "%2F", "%30", "%31", "%32", "%33",
0533: "%34", "%35", "%36", "%37", "%38", "%39", "%3A", "%3B",
0534: "%3C", "%3D", "%3E", "%3F", "%40", "%41", "%42", "%43",
0535: "%44", "%45", "%46", "%47", "%48", "%49", "%4A", "%4B",
0536: "%4C", "%4D", "%4E", "%4F", "%50", "%51", "%52", "%53",
0537: "%54", "%55", "%56", "%57", "%58", "%59", "%5A", "%5B",
0538: "%5C", "%5D", "%5E", "%5F", "%60", "%61", "%62", "%63",
0539: "%64", "%65", "%66", "%67", "%68", "%69", "%6A", "%6B",
0540: "%6C", "%6D", "%6E", "%6F", "%70", "%71", "%72", "%73",
0541: "%74", "%75", "%76", "%77", "%78", "%79", "%7A", "%7B",
0542: "%7C", "%7D", "%7E", "%7F", "%80", "%81", "%82", "%83",
0543: "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B",
0544: "%8C", "%8D", "%8E", "%8F", "%90", "%91", "%92", "%93",
0545: "%94", "%95", "%96", "%97", "%98", "%99", "%9A", "%9B",
0546: "%9C", "%9D", "%9E", "%9F", "%A0", "%A1", "%A2", "%A3",
0547: "%A4", "%A5", "%A6", "%A7", "%A8", "%A9", "%AA", "%AB",
0548: "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3",
0549: "%B4", "%B5", "%B6", "%B7", "%B8", "%B9", "%BA", "%BB",
0550: "%BC", "%BD", "%BE", "%BF", "%C0", "%C1", "%C2", "%C3",
0551: "%C4", "%C5", "%C6", "%C7", "%C8", "%C9", "%CA", "%CB",
0552: "%CC", "%CD", "%CE", "%CF", "%D0", "%D1", "%D2", "%D3",
0553: "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB",
0554: "%DC", "%DD", "%DE", "%DF", "%E0", "%E1", "%E2", "%E3",
0555: "%E4", "%E5", "%E6", "%E7", "%E8", "%E9", "%EA", "%EB",
0556: "%EC", "%ED", "%EE", "%EF", "%F0", "%F1", "%F2", "%F3",
0557: "%F4", "%F5", "%F6", "%F7", "%F8", "%F9", "%FA", "%FB",
0558: "%FC", "%FD", "%FE", "%FF" };
0559:
0560: /**
0561: * Encode a string to the "x-www-form-urlencoded" form, enhanced
0562: * with the UTF-8-in-URL proposal. This is what happens:
0563: *
0564: * <ul>
0565: * <li>The ASCII characters 'a' through 'z', 'A' through 'Z',
0566: * and '0' through '9' remain the same.
0567: *
0568: * <li>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
0569: *
0570: * <li>All other ASCII characters are converted into the
0571: * 3-character string "%xy", where xy is
0572: * the two-digit hexadecimal representation of the character
0573: * code
0574: *
0575: * <li>All non-ASCII characters are encoded in two steps: first
0576: * to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
0577: * secondly each of these bytes is encoded as "%xx".
0578: * </ul>
0579: *
0580: * @param s The string to be encoded
0581: * @return The encoded string
0582: */
0583: // from: http://www.w3.org/International/URLUTF8Encoder.java
0584: public static String escape(String s) {
0585: StringBuffer sbuf = new StringBuffer();
0586: int len = s.length();
0587: for (int i = 0; i < len; i++) {
0588: int ch = s.charAt(i);
0589: if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
0590: sbuf.append((char) ch);
0591: } else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
0592: sbuf.append((char) ch);
0593: } else if ('0' <= ch && ch <= '9') { // '0'..'9'
0594: sbuf.append((char) ch);
0595: } else if (ch == ' ') { // space
0596: sbuf.append("%20");
0597: } else if (ch == '&'
0598: || ch == ':' // unreserved
0599: || ch == '-' || ch == '_' || ch == '.' || ch == '!'
0600: || ch == '~' || ch == '*' || ch == '\''
0601: || ch == '(' || ch == ')' || ch == ';') {
0602: sbuf.append((char) ch);
0603: } else if (ch <= 0x007f) { // other ASCII
0604: sbuf.append(hex[ch]);
0605: } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
0606: sbuf.append(hex[0xc0 | (ch >> 6)]);
0607: sbuf.append(hex[0x80 | (ch & 0x3F)]);
0608: } else { // 0x7FF < ch <= 0xFFFF
0609: sbuf.append(hex[0xe0 | (ch >> 12)]);
0610: sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
0611: sbuf.append(hex[0x80 | (ch & 0x3F)]);
0612: }
0613: }
0614: return sbuf.toString();
0615: }
0616:
0617: // from: http://www.w3.org/International/unescape.java
0618: public static String unescape(String s) {
0619: StringBuffer sbuf = new StringBuffer();
0620: int l = s.length();
0621: int ch = -1;
0622: int b, sumb = 0;
0623: for (int i = 0, more = -1; i < l; i++) {
0624: /* Get next byte b from URL segment s */
0625: switch (ch = s.charAt(i)) {
0626: case '%':
0627: ch = s.charAt(++i);
0628: int hb = (Character.isDigit((char) ch) ? ch - '0'
0629: : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
0630: ch = s.charAt(++i);
0631: int lb = (Character.isDigit((char) ch) ? ch - '0'
0632: : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
0633: b = (hb << 4) | lb;
0634: break;
0635: case '+':
0636: b = ' ';
0637: break;
0638: default:
0639: b = ch;
0640: }
0641: /* Decode byte b as UTF-8, sumb collects incomplete chars */
0642: if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte)
0643: sumb = (sumb << 6) | (b & 0x3f); // Add 6 bits to sumb
0644: if (--more == 0)
0645: sbuf.append((char) sumb); // Add char to sbuf
0646: } else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits)
0647: sbuf.append((char) b); // Store in sbuf
0648: } else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits)
0649: sumb = b & 0x1f;
0650: more = 1; // Expect 1 more byte
0651: } else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits)
0652: sumb = b & 0x0f;
0653: more = 2; // Expect 2 more bytes
0654: } else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits)
0655: sumb = b & 0x07;
0656: more = 3; // Expect 3 more bytes
0657: } else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits)
0658: sumb = b & 0x03;
0659: more = 4; // Expect 4 more bytes
0660: } else /*if ((b & 0xfe) == 0xfc)*/{ // 1111110x (yields 1 bit)
0661: sumb = b & 0x01;
0662: more = 5; // Expect 5 more bytes
0663: }
0664: /* We don't test if the UTF-8 encoding is well-formed */
0665: }
0666: return sbuf.toString();
0667: }
0668:
0669: private void identPort(String inputURL, int dflt)
0670: throws MalformedURLException {
0671: // identify ref in file
0672: int r = this .host.indexOf(':');
0673: if (r < 0) {
0674: this .port = dflt;
0675: } else {
0676: try {
0677: String portStr = this .host.substring(r + 1);
0678: if (portStr.trim().length() > 0)
0679: this .port = Integer.parseInt(portStr);
0680: else
0681: this .port = -1;
0682: this .host = this .host.substring(0, r);
0683: } catch (NumberFormatException e) {
0684: throw new MalformedURLException(
0685: "wrong port in host fragment '" + this .host
0686: + "' of input url '" + inputURL + "'");
0687: }
0688: }
0689: }
0690:
0691: private void identRef() {
0692: // identify ref in file
0693: int r = path.indexOf('#');
0694: if (r < 0) {
0695: this .ref = null;
0696: } else {
0697: this .ref = path.substring(r + 1);
0698: this .path = path.substring(0, r);
0699: }
0700: }
0701:
0702: private void identQuest() {
0703: // identify quest in file
0704: int r = path.indexOf('?');
0705: if (r < 0) {
0706: this .quest = null;
0707: } else {
0708: this .quest = path.substring(r + 1);
0709: this .path = path.substring(0, r);
0710: }
0711: }
0712:
// returns the path plus query plus reference; same as getFile(true)
0713: public String getFile() {
0714: return getFile(true);
0715: }
0716:
0717: public String getFile(boolean includeReference) {
0718: // this is the path plus quest plus ref
0719: // if there is no quest and no ref the result is identical to getPath
0720: // this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile()
0721: if (quest != null)
0722: return ((includeReference) && (ref != null)) ? path + "?"
0723: + quest + "#" + ref : path + "?" + quest;
0724: return ((includeReference) && (ref != null)) ? path + "#" + ref
0725: : path;
0726: }
0727:
0728: public String getFileName() {
0729: // this is a method not defined in any sun api
0730: // it returns the last portion of a path without any reference
0731: int p = path.lastIndexOf('/');
0732: if (p < 0)
0733: return path;
0734: if (p == path.length() - 1)
0735: return ""; // no file name, this is a path to a directory
0736: return path.substring(p + 1); // the 'real' file name
0737: }
0738:
// returns the path of the url, without query and reference
0739: public String getPath() {
0740: return path;
0741: }
0742:
0743: public String getAuthority() {
0744: return ((port >= 0) && (host != null)) ? host + ":" + port
0745: : ((host != null) ? host : "");
0746: }
0747:
// returns the host as stored; a port that was part of the host has been split off by identPort
0748: public String getHost() {
0749: return host;
0750: }
0751:
// returns the port number; negative if no port is known
0752: public int getPort() {
0753: return port;
0754: }
0755:
// returns the protocol, e.g. "http"
0756: public String getProtocol() {
0757: return protocol;
0758: }
0759:
// returns the reference (the part behind '#'), or null if there is none
0760: public String getRef() {
0761: return ref;
0762: }
0763:
// returns the user info (the part in front of '@'), or null if there is none
0764: public String getUserInfo() {
0765: return userInfo;
0766: }
0767:
// returns the query (the part behind '?'), or null if there is none
0768: public String getQuery() {
0769: return quest;
0770: }
0771:
// returns the normal form of the url, see toNormalform(false, true):
// the reference is not stripped and the stripAmp option is switched on
0772: public String toString() {
0773: return toNormalform(false, true);
0774: }
0775:
0776: public String toNormalform(boolean stripReference, boolean stripAmp) {
0777: if (stripAmp)
0778: return toNormalform(!stripReference).replaceAll("&",
0779: "&");
0780: else
0781: return toNormalform(!stripReference);
0782: }
0783:
0784: private String toNormalform(boolean includeReference) {
0785: // generates a normal form of the URL
0786: boolean defaultPort = false;
0787: if (this .protocol.equals("mailto")) {
0788: return this .protocol + ":" + this .userInfo + "@"
0789: + this .host;
0790: } else if (this .protocol.equals("http")) {
0791: if (this .port < 0 || this .port == 80) {
0792: defaultPort = true;
0793: }
0794: } else if (this .protocol.equals("ftp")) {
0795: if (this .port < 0 || this .port == 21) {
0796: defaultPort = true;
0797: }
0798: } else if (this .protocol.equals("https")) {
0799: if (this .port < 0 || this .port == 443) {
0800: defaultPort = true;
0801: }
0802: }
0803: String path = resolveBackpath(this .getFile(includeReference));
0804:
0805: if (defaultPort) {
0806: return this .protocol
0807: + "://"
0808: + ((this .userInfo != null) ? (this .userInfo + "@")
0809: : ("")) + this .getHost().toLowerCase()
0810: + path;
0811: }
0812: return this .protocol
0813: + "://"
0814: + ((this .userInfo != null) ? (this .userInfo + "@")
0815: : ("")) + this .getHost().toLowerCase()
0816: + ((defaultPort) ? ("") : (":" + this .port)) + path;
0817: }
0818:
0819: public boolean equals(yacyURL other) {
0820: return (((this .protocol == other.protocol) || (this .protocol
0821: .equals(other.protocol)))
0822: && ((this .host == other.host) || (this .host
0823: .equals(other.host)))
0824: && ((this .userInfo == other.userInfo) || (this .userInfo
0825: .equals(other.userInfo)))
0826: && ((this .path == other.path) || (this .path
0827: .equals(other.path)))
0828: && ((this .quest == other.quest) || (this .quest
0829: .equals(other.quest)))
0830: && ((this .ref == other.ref) || (this .ref
0831: .equals(other.ref))) && ((this .port == other.port)));
0832: }
0833:
// the hash code is derived from the url hash string returned by hash(),
// which computes that string first if it is not known yet
0834: public int hashCode() {
0835: return this .hash().hashCode();
0836: }
0837:
0838: public int compareTo(Object h) {
0839: assert (h instanceof yacyURL);
0840: return this .toString().compareTo(((yacyURL) h).toString());
0841: }
0842:
0843: public boolean isPOST() {
0844: return (this .quest != null) && (this .quest.length() > 0);
0845: }
0846:
0847: public boolean isCGI() {
0848: String ls = path.toLowerCase();
0849: return ((ls.indexOf(".cgi") >= 0) || (ls.indexOf(".exe") >= 0)
0850: || (ls.indexOf(";jsessionid=") >= 0)
0851: || (ls.indexOf("sessionid/") >= 0)
0852: || (ls.indexOf("phpsessid=") >= 0)
0853: || (ls.indexOf("search.php?sid=") >= 0) || (ls
0854: .indexOf("memberlist.php?sid=") >= 0));
0855: }
0856:
0857: // static methods from plasmaURL
0858:
0859: public static final int flagTypeID(String hash) {
0860: return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0861: .charAt(11)) & 32) >> 5;
0862: }
0863:
0864: public static final int flagTLDID(String hash) {
0865: return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0866: .charAt(11)) & 28) >> 2;
0867: }
0868:
0869: public static final int flagLengthID(String hash) {
0870: return (kelondroBase64Order.enhancedCoder.decodeByte(hash
0871: .charAt(11)) & 3);
0872: }
0873:
0874: public final String hash() {
0875: // in case that the object was initialized without a known url hash, compute it now
0876: if (this .hash == null)
0877: this .hash = urlHashComputation();
0878: return this .hash;
0879: }
0880:
0881: private final String urlHashComputation() {
0882: // the url hash computation needs a DNS lookup to check if the addresses domain is local
0883: // that causes that this method may be very slow
0884:
0885: assert this .hash == null; // should only be called if the hash was not computed bevore
0886:
0887: int p = this .host.lastIndexOf('.');
0888: String tld = "", dom = tld;
0889: if (p > 0) {
0890: tld = host.substring(p + 1);
0891: dom = host.substring(0, p);
0892: }
0893: Integer ID = (serverDomains.isLocal(tld)) ? null
0894: : (Integer) TLDID.get(tld); // identify local addresses
0895: int id = (ID == null) ? 7 : ID.intValue(); // local addresses are flagged with id=7
0896: boolean isHTTP = this .protocol.equals("http");
0897: p = dom.lastIndexOf('.'); // locate subdomain
0898: String subdom = "";
0899: if (p > 0) {
0900: subdom = dom.substring(0, p);
0901: dom = dom.substring(p + 1);
0902: }
0903:
0904: // find rootpath
0905: String pathx = new String(this .path);
0906: if (pathx.startsWith("/"))
0907: pathx = pathx.substring(1);
0908: if (pathx.endsWith("/"))
0909: pathx = pathx.substring(0, pathx.length() - 1);
0910: p = pathx.indexOf('/');
0911: String rootpath = "";
0912: if (p > 0) {
0913: rootpath = pathx.substring(0, p);
0914: }
0915:
0916: // we collected enough information to compute the fragments that are
0917: // basis for hashes
0918: int l = dom.length();
0919: int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2
0920: : 3;
0921: byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
0922:
0923: // combine the attributes
0924: StringBuffer hash = new StringBuffer(12);
0925: // form the 'local' part of the hash
0926: hash.append(kelondroBase64Order.enhancedCoder.encode(
0927: serverCodings.encodeMD5Raw(toNormalform(true, true)))
0928: .substring(0, 5)); // 5 chars
0929: hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
0930: // form the 'global' part of the hash
0931: hash.append(protocolHostPort(this .protocol, host, port)); // 5 chars
0932: hash.append(kelondroBase64Order.enhancedCoder
0933: .encodeByte(flagbyte)); // 1 char
0934:
0935: // return result hash
0936: return new String(hash);
0937: }
0938:
0939: private static char subdomPortPath(String subdom, int port,
0940: String rootpath) {
0941: return kelondroBase64Order.enhancedCoder.encode(
0942: serverCodings.encodeMD5Raw(subdom + ":" + port + ":"
0943: + rootpath)).charAt(0);
0944: }
0945:
0946: private static final char rootURLFlag0 = subdomPortPath("", 80, "");
0947: private static final char rootURLFlag1 = subdomPortPath("www", 80,
0948: "");
0949:
0950: public static final boolean probablyRootURL(String urlHash) {
0951: return (urlHash.charAt(5) == rootURLFlag0)
0952: || (urlHash.charAt(5) == rootURLFlag1);
0953: }
0954:
0955: private static String protocolHostPort(String protocol,
0956: String host, int port) {
0957: return kelondroBase64Order.enhancedCoder.encode(
0958: serverCodings.encodeMD5Raw(protocol + ":" + host + ":"
0959: + port)).substring(0, 5);
0960: }
0961:
0962: private static String[] testTLDs = new String[] { "com", "net",
0963: "org", "uk", "fr", "de", "es", "it" };
0964:
0965: public static final yacyURL probablyWordURL(String urlHash,
0966: TreeSet<String> words) {
0967: Iterator<String> wi = words.iterator();
0968: String word;
0969: while (wi.hasNext()) {
0970: word = wi.next();
0971: if ((word == null) || (word.length() == 0))
0972: continue;
0973: String pattern = urlHash.substring(6, 11);
0974: for (int i = 0; i < testTLDs.length; i++) {
0975: if (pattern.equals(protocolHostPort("http", "www."
0976: + word.toLowerCase() + "." + testTLDs[i], 80)))
0977: try {
0978: return new yacyURL("http://www."
0979: + word.toLowerCase() + "."
0980: + testTLDs[i], null);
0981: } catch (MalformedURLException e) {
0982: return null;
0983: }
0984: }
0985: }
0986: return null;
0987: }
0988:
0989: public static final boolean isWordRootURL(String givenURLHash,
0990: TreeSet<String> words) {
0991: if (!(probablyRootURL(givenURLHash)))
0992: return false;
0993: yacyURL wordURL = probablyWordURL(givenURLHash, words);
0994: if (wordURL == null)
0995: return false;
0996: if (wordURL.hash().equals(givenURLHash))
0997: return true;
0998: return false;
0999: }
1000:
1001: public static final int domLengthEstimation(String urlHash) {
1002: // generates an estimation of the original domain length
1003: assert (urlHash != null);
1004: assert (urlHash.length() == 12) : "urlhash = " + urlHash;
1005: int flagbyte = kelondroBase64Order.enhancedCoder
1006: .decodeByte(urlHash.charAt(11));
1007: int domLengthKey = flagbyte & 3;
1008: switch (domLengthKey) {
1009: case 0:
1010: return 4;
1011: case 1:
1012: return 10;
1013: case 2:
1014: return 14;
1015: case 3:
1016: return 20;
1017: }
1018: return 20;
1019: }
1020:
1021: public static int domLengthNormalized(String urlHash) {
1022: return domLengthEstimation(urlHash) << 8 / 20;
1023: }
1024:
1025: public static final int domDomain(String urlHash) {
1026: // returns the ID of the domain of the domain
1027: assert (urlHash != null);
1028: assert (urlHash.length() == 12) : "urlhash = " + urlHash;
1029: int flagbyte = kelondroBase64Order.enhancedCoder
1030: .decodeByte(urlHash.charAt(11));
1031: return (flagbyte & 12) >> 2;
1032: }
1033:
1034: public static boolean isGlobalDomain(String urlhash) {
1035: return domDomain(urlhash) != 7;
1036: }
1037:
1038: // checks for local/global IP range and local IP
1039: public boolean isLocal() {
1040: return serverDomains.isLocal(this .host);
1041: }
1042:
1043: // language calculation
1044: public static String language(yacyURL url) {
1045: String language = "uk";
1046: String host = url.getHost();
1047: int pos = host.lastIndexOf(".");
1048: if ((pos > 0) && (host.length() - pos == 3))
1049: language = host.substring(pos + 1).toLowerCase();
1050: return language;
1051: }
1052:
1053: public static void main(String[] args) {
1054: String[][] test = new String[][] {
1055: new String[] { null,
1056: "http://www.anomic.de/home/test?x=1#home" },
1057: new String[] { null,
1058: "http://www.anomic.de/home/test?x=1" },
1059: new String[] { null,
1060: "http://www.anomic.de/home/test#home" },
1061: new String[] { null,
1062: "ftp://ftp.anomic.de/home/test#home" },
1063: new String[] { null,
1064: "http://www.anomic.de/home/../abc/" },
1065: new String[] { null, "mailto:abcdefg@nomailnomail.com" },
1066: new String[] { "http://www.anomic.de/home", "test" },
1067: new String[] { "http://www.anomic.de/home", "test/" },
1068: new String[] { "http://www.anomic.de/home/", "test" },
1069: new String[] { "http://www.anomic.de/home/", "test/" },
1070: new String[] { "http://www.anomic.de/home/index.html",
1071: "test.htm" },
1072: new String[] { "http://www.anomic.de/home/index.html",
1073: "http://www.yacy.net/test" },
1074: new String[] { "http://www.anomic.de/home/index.html",
1075: "ftp://ftp.yacy.net/test" },
1076: new String[] { "http://www.anomic.de/home/index.html",
1077: "../test" },
1078: new String[] { "http://www.anomic.de/home/index.html",
1079: "mailto:abcdefg@nomailnomail.com" },
1080: new String[] { null, "news:de.test" },
1081: new String[] { "http://www.anomic.de/home",
1082: "news:de.test" },
1083: new String[] { "http://www.anomic.de/home",
1084: "ftp://ftp.anomic.de/src" },
1085: new String[] { null, "ftp://ftp.delegate.org/" },
1086: new String[] { "http://www.anomic.de/home",
1087: "ftp://ftp.delegate.org/" },
1088: new String[] { "http://www.anomic.de",
1089: "mailto:yacy@weltherrschaft.org" },
1090: new String[] { "http://www.anomic.de", "javascipt:temp" },
1091: new String[] {
1092: null,
1093: "http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history" },
1094: new String[] {
1095: null,
1096: "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" },
1097: new String[] {
1098: null,
1099: "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" } };
1100: String environment, url;
1101: yacyURL aURL, aURL1;
1102: java.net.URL jURL;
1103: for (int i = 0; i < test.length; i++) {
1104: environment = test[i][0];
1105: url = test[i][1];
1106: try {
1107: aURL = yacyURL.newURL(environment, url);
1108: } catch (MalformedURLException e) {
1109: aURL = null;
1110: }
1111: if (environment == null) {
1112: try {
1113: jURL = new java.net.URL(url);
1114: } catch (MalformedURLException e) {
1115: jURL = null;
1116: }
1117: } else {
1118: try {
1119: jURL = new java.net.URL(new java.net.URL(
1120: environment), url);
1121: } catch (MalformedURLException e) {
1122: jURL = null;
1123: }
1124: }
1125:
1126: // check equality to java.net.URL
1127: if (((aURL == null) && (jURL != null))
1128: || ((aURL != null) && (jURL == null))
1129: || ((aURL != null) && (jURL != null) && (!(jURL
1130: .toString().equals(aURL.toString()))))) {
1131: System.out.println("Difference for environment="
1132: + environment + ", url=" + url + ":");
1133: System.out
1134: .println((jURL == null) ? "jURL rejected input"
1135: : "jURL=" + jURL.toString());
1136: System.out
1137: .println((aURL == null) ? "aURL rejected input"
1138: : "aURL=" + aURL.toString());
1139: }
1140:
1141: // check stability: the normalform of the normalform must be equal to the normalform
1142: if (aURL != null)
1143: try {
1144: aURL1 = new yacyURL(aURL.toNormalform(false, true),
1145: null);
1146: if (!(aURL1.toNormalform(false, true).equals(aURL
1147: .toNormalform(false, true)))) {
1148: System.out.println("no stability for url:");
1149: System.out.println("aURL0=" + aURL.toString());
1150: System.out.println("aURL1=" + aURL1.toString());
1151: }
1152: } catch (MalformedURLException e) {
1153: System.out.println("no stability for url:");
1154: System.out.println("aURL0=" + aURL.toString());
1155: System.out.println("aURL1 cannot be computed:"
1156: + e.getMessage());
1157: }
1158: }
1159: }
1160: }
|