001: /*
002: * Copyright 2005 by Lars Torunski
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.torunski.crawler.util;
018:
import java.util.ArrayList;
import java.util.Collection;
import java.util.Locale;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.torunski.crawler.filter.ILinkFilter;
029:
030: /**
031: * Helpful methods for links.
032: *
033: * @author Lars Torunski
034: * @version $Revision: 1.9 $
035: */
036: public class LinksUtil {
037:
038: private static final transient Log log = LogFactory
039: .getLog(LinksUtil.class);
040:
041: private LinksUtil() {
042: }
043:
044: /**
045: * @param currentLink the current page in which the new link is contained
046: * @param newLink the to be completed link
047: * @return a full qualified link or <code>null</code> if the newLink can't be parsed.
048: */
049: public static final String getURI(String currentLink, String newLink) {
050: if (newLink == null) {
051: return null;
052: }
053:
054: try {
055: // workaround for http:/path/example.htm
056: if (!newLink.startsWith("http://")
057: && newLink.startsWith("http:/")) {
058: newLink = newLink.substring(5);
059: }
060:
061: // workaround for https:/path/example.htm
062: if (!newLink.startsWith("https://")
063: && newLink.startsWith("https:/")) {
064: newLink = newLink.substring(5);
065: }
066:
067: // create new URIs
068: // TODO check new URI constructors
069: URI base = new URI(currentLink, false);
070: URI newURI = new URI(base, newLink, false);
071:
072: // ignore the schemes other than http
073: if (!"http".equals(newURI.getScheme())
074: && !"https".equals(newURI.getScheme())) {
075: return null;
076: }
077:
078: return newURI.toString();
079: } catch (URIException e) {
080: log.info("URI problem with current link '" + currentLink
081: + "' and new link '" + newLink + '\'', e);
082: return null;
083: }
084: }
085:
086: /**
087: * @param url the url (origin) of the page
088: * @param content the complete web page
089: * @return a collection of new links
090: */
091: public static Collection retrieveLinks(String url, String content,
092: ILinkFilter linkFilter) {
093: // FIXME performance and memory!!!
094: String pageLower = content.toLowerCase();
095:
096: Collection result = new ArrayList();
097:
098: // find all the links
099: int pos = 0;
100: while (pos < content.length()) {
101: // find a link
102: // FIXME performance with reg expression
103: // TODO image links extraction
104: // TODO <a class="sub" href="../download/meilensteine_bhf_bank.pdf" onclick="return loadPDF(this.href,1);"
105: // onkeypress="return loadPDF(this.href);" target="_blank">439KB</a>
106: // TODO frame link extraction
107: int start = pageLower.indexOf("<a href=\"", pos);
108: if (start != -1) {
109: int end = content.indexOf("\"", start + 9);
110:
111: // create a full qualified link
112: String link = LinksUtil.getURI(url, content.substring(
113: start + 9, end));
114:
115: // if no filter is set or a set filter accepts the link, then add it to the list
116: if ((link != null)
117: && ((linkFilter == null) || (linkFilter.accept(
118: url, link)))) {
119: result.add(link);
120: }
121:
122: // next parsing position
123: pos = end + 1;
124: } else {
125: // end parsing
126: pos = content.length();
127: }
128: }
129:
130: return result;
131: }
132:
133: }
|