001: package net.javacoding.jspider.core.util;
002:
003: import java.net.MalformedURLException;
004: import java.net.URL;
005: import java.util.ArrayList;
006: import java.util.StringTokenizer;
007:
008: /**
009: * Some URL related methods gathered as static methods in this utility class.
010: *
011: * $Id: URLUtil.java,v 1.13 2003/04/29 17:53:49 vanrogu Exp $
012: *
013: * @author Günther Van Roey
014: */
015: public class URLUtil {
016:
017: /**
018: * Normalizes the given url by replacing '/./' by '/' and removes trailing slashes
019: * @param original the original URL to be normalized
020: * @return the normalized url
021: */
022: public static URL normalize(URL original) {
023: URL normalized = null;
024:
025: if (original != null) {
026:
027: String urlString = original.toString();
028:
029: urlString = normalizeDotFolders(urlString);
030: urlString = normalizeBackSlashes(urlString);
031: urlString = normalizeDoubleSlashes(urlString);
032: urlString = normalizeStripQuery(urlString);
033: //urlString = normalizeStripTrailingSlash(urlString) ;
034:
035: try {
036: normalized = new URL(urlString);
037: } catch (MalformedURLException e) {
038: }
039: }
040: return normalized;
041: }
042:
043: /**
044: * Replaces all backslashes by front slashes in the given url string
045: * @param original the original url string
046: * @return the url string with the normalization applied
047: */
048: protected static String normalizeBackSlashes(String original) {
049: return StringUtil.replace(original, "\\", "/");
050: }
051:
052: /**
053: * Replaces all double slashes by single slashes in the given url string
054: * @param original the original url string
055: * @return the url string with the normalization applied
056: */
057: protected static String normalizeDoubleSlashes(String original) {
058: return StringUtil.replace(original, "//", "/", 7);
059: }
060:
061: /**
062: * Removes all dot folders ( abc/./def/index.html, ...) from the given
063: * url string
064: * @param original the original url string
065: * @return the url string with the normalization applied
066: */
067: protected static String normalizeDotFolders(String original) {
068: return StringUtil.replace(original, "/./", "/");
069: }
070:
071: /**
072: * Strips an eventual query string from the resource (index.html?id=1
073: * becomes index.html for instance).
074: * @param original the original url string
075: * @return the url string with the normalization applied
076: */
077: protected static String normalizeStripQuery(String original) {
078: int index = original.indexOf('?');
079: if (index >= 0) {
080: return original.substring(0, index);
081: } else {
082: return original;
083: }
084: }
085:
086: /**
087: * Removes an evantual trailing slash from the given url string
088: * @param original the original url string
089: * @return the url string with the normalization applied
090: */
091: protected static String normalizeStripTrailingSlash(String original) {
092: if (original.endsWith("/")) {
093: return original.substring(0, original.length() - 1);
094: } else {
095: return original;
096: }
097: }
098:
099: /**
100: * Converts any resource URL to the site's url.
101: * @param resourceURL the url of the resource to find the url of the site for
102: * @return the URL pointing to the site in which the resource is located
103: */
104: public static URL getSiteURL(URL resourceURL) {
105: URL siteURL = null;
106: if (resourceURL != null) {
107: try {
108: siteURL = new URL(resourceURL.getProtocol(),
109: resourceURL.getHost(), resourceURL.getPort(),
110: "");
111: } catch (MalformedURLException e) {
112: // shouldn't happen, we're only dropping the PATH part of a valid URL ...
113: }
114: }
115: return siteURL;
116: }
117:
118: /**
119: * Reuturns the URL of the robots.txt resource in the site of the given resource.
120: * @param resourceURL the URL of the resource to find the site's robots.txt of
121: * @return URL pointing to the robots.txt resource of the site in which resourceURL is
122: */
123: public static URL getRobotsTXTURL(URL resourceURL) {
124: URL retVal = null;
125: if (resourceURL != null) {
126: try {
127: retVal = new URL(getSiteURL(resourceURL), "/robots.txt");
128: } catch (MalformedURLException e) {
129: }
130: }
131: return retVal;
132: }
133:
134: /**
135: * returns the resource path without the resource.
136: * @param path the path to the resource
137: * @return path without the resource itself
138: */
139: public static String stripResource(String path) {
140: String result = null;
141: if (path != null) {
142: int pos = path.lastIndexOf("/");
143: result = path.substring(0, pos + 1);
144: }
145: return result;
146: }
147:
148: /**
149: * Returns the 'depth' of the resource pointed to by the URL
150: * @param url the URL to the resource to calculate the depth of
151: * @return the depth of this resource in the site
152: */
153: public static int getDepth(URL url) {
154: int depth = 0;
155:
156: if (url != null) {
157: String path = url.getPath();
158: if (!isFileSpecified(url) && !path.endsWith("/")) {
159: path = path + "/";
160: }
161: int pos = path.indexOf('/');
162: while (pos != -1) {
163: if (pos > 0) {
164: depth++;
165: }
166: pos = path.indexOf('/', pos + 1);
167: }
168: }
169: return depth;
170: }
171:
172: /**
173: * Determines whether a file is specified in the path part of the url.
174: * This is assumed to be the case if the string after the last slash
175: * contains a dot (aaaaa/bbbb/cccc.dddd).
176: * @param url the url to test
177: * @return boolean value indicating whether a file is specified
178: */
179: public static boolean isFileSpecified(URL url) {
180: boolean specified = false;
181:
182: String path = url.getPath();
183: int posLastSlash = path.lastIndexOf('/');
184: int posLastDot = path.lastIndexOf('.');
185:
186: specified = posLastDot > posLastSlash;
187:
188: return specified;
189: }
190:
191: /**
192: * Returns an array of Strings being the folder names of the folders
193: * found in the given URL.
194: * @param url the url to parse the folders of
195: * @return an array of Strings containing all folder names
196: */
197: public static String[] getFolderNames(URL url) {
198: url = normalize(url);
199: ArrayList al = new ArrayList();
200:
201: String path = url.getPath();
202: if (isFileSpecified(url)) {
203: path = stripResource(path);
204: }
205: StringTokenizer st = new StringTokenizer(path, "/");
206:
207: while (st.hasMoreTokens()) {
208: al.add(st.nextToken());
209: }
210: return (String[]) al.toArray(new String[al.size()]);
211: }
212:
213: /**
214: * Returns the file name (without the path) of the resource specified
215: * by the given url.
216: * @param url the url to get the filename out of
217: * @return String containing the name of the file, zero-length if none
218: */
219: public static String getFileName(URL url) {
220: return url.getPath().substring(
221: stripResource(url.getPath()).length());
222: }
223:
224: }
|