001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: *
017: */
018: package org.apache.ivy.util.url;
019:
020: import java.io.BufferedReader;
021: import java.io.IOException;
022: import java.io.InputStreamReader;
023: import java.net.URL;
024: import java.util.ArrayList;
025: import java.util.List;
026: import java.util.Locale;
027: import java.util.regex.Matcher;
028: import java.util.regex.Pattern;
029:
030: import org.apache.ivy.util.FileUtil;
031: import org.apache.ivy.util.Message;
032:
033: /**
034: * Utility class which helps to list urls under a given url. This has been tested with Apache 1.3.33
035: * server listing, as the one used at ibiblio, and with Apache 2.0.53 server listing, as the one on
036: * mirrors.sunsite.dk.
037: */
038: public class ApacheURLLister {
039: // ~ Static variables/initializers ------------------------------------------
040:
041: private static final Pattern PATTERN = Pattern
042: .compile(
043: "<a[^>]*href=\"([^\"]*)\"[^>]*>(?:<[^>]+>)*?([^<>]+?)(?:<[^>]+>)*?</a>",
044: Pattern.CASE_INSENSITIVE);
045:
046: // ~ Methods ----------------------------------------------------------------
047:
048: /**
049: * Returns a list of sub urls of the given url. The returned list is a list of URL.
050: *
051: * @param url
052: * The base URL from which to retrieve the listing.
053: * @return a list of sub urls of the given url.
054: * @throws IOException
055: * If an error occures retrieving the HTML.
056: */
057: public List listAll(URL url) throws IOException {
058: return retrieveListing(url, true, true);
059: }
060:
061: /**
062: * Returns a list of sub 'directories' of the given url. The returned list is a list of URL.
063: *
064: * @param url
065: * The base URL from which to retrieve the listing.
066: * @return a list of sub 'directories' of the given url.
067: * @throws IOException
068: * If an error occures retrieving the HTML.
069: */
070: public List listDirectories(URL url) throws IOException {
071: return retrieveListing(url, false, true);
072: }
073:
074: /**
075: * Returns a list of sub 'files' (in opposition to directories) of the given url. The returned
076: * list is a list of URL.
077: *
078: * @param url
079: * The base URL from which to retrieve the listing.
080: * @return a list of sub 'files' of the given url.
081: * @throws IOException
082: * If an error occures retrieving the HTML.
083: */
084: public List listFiles(URL url) throws IOException {
085: return retrieveListing(url, true, false);
086: }
087:
088: /**
089: * Retrieves a {@link List} of {@link URL}s corresponding to the files and/or directories found
090: * at the supplied base URL.
091: *
092: * @param url
093: * The base URL from which to retrieve the listing.
094: * @param includeFiles
095: * If true include files in the returned list.
096: * @param includeDirectories
097: * If true include directories in the returned list.
098: * @return A {@link List} of {@link URL}s.
099: * @throws IOException
100: * If an error occures retrieving the HTML.
101: */
102: public List retrieveListing(URL url, boolean includeFiles,
103: boolean includeDirectories) throws IOException {
104: List urlList = new ArrayList();
105:
106: // add trailing slash for relative urls
107: if (!url.getPath().endsWith("/")
108: && !url.getPath().endsWith(".html")) {
109: url = new URL(url.getProtocol(), url.getHost(), url
110: .getPort(), url.getPath() + "/");
111: }
112:
113: BufferedReader r = new BufferedReader(new InputStreamReader(
114: URLHandlerRegistry.getDefault().openStream(url)));
115:
116: String htmlText = FileUtil.readEntirely(r);
117:
118: Matcher matcher = PATTERN.matcher(htmlText);
119:
120: while (matcher.find()) {
121: // get the href text and the displayed text
122: String href = matcher.group(1);
123: String text = matcher.group(2);
124:
125: if ((href == null) || (text == null)) {
126: // the groups were not found (shouldn't happen, really)
127: continue;
128: }
129:
130: text = text.trim();
131:
132: // absolute href: convert to relative one
133: if (href.startsWith("/")) {
134: int slashIndex = href.substring(0, href.length() - 1)
135: .lastIndexOf('/');
136: href = href.substring(slashIndex + 1);
137: }
138:
139: // relative to current href: convert to simple relative one
140: if (href.startsWith("./")) {
141: href = href.substring("./".length());
142: }
143:
144: // exclude those where they do not match
145: // href will never be truncated, text may be truncated by apache
146: // may have a '.' from either the extension (.jar) or "..>"
147: int dotIndex = text.indexOf('.');
148:
149: if (((dotIndex != -1) && !href.startsWith(text.substring(0,
150: dotIndex)))
151: || ((dotIndex == -1) && !href
152: .toLowerCase(Locale.US).equals(
153: text.toLowerCase(Locale.US)))) {
154: // the href and the text do not "match"
155: continue;
156: }
157:
158: boolean directory = href.endsWith("/");
159:
160: if ((directory && includeDirectories)
161: || (!directory && includeFiles)) {
162: URL child = new URL(url, href);
163: urlList.add(child);
164: Message.debug("ApacheURLLister found URL=[" + child
165: + "].");
166: }
167: }
168:
169: return urlList;
170: }
171: }
|