001: /*
002: * Copyright 2006 Hippo.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package nl.hippo.util;
018:
019: import java.io.BufferedReader;
020: import java.io.ByteArrayInputStream;
021: import java.io.IOException;
022: import java.io.InputStreamReader;
023: import java.io.StringWriter;
024: import java.util.Collection;
025: import java.util.Date;
026: import java.util.HashMap;
027: import java.util.Iterator;
028: import java.util.LinkedList;
029: import java.util.List;
030: import java.util.Map;
031: import java.util.StringTokenizer;
032:
033: import org.apache.commons.httpclient.Credentials;
034: import org.apache.commons.httpclient.HttpClient;
035: import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
036: import org.apache.commons.httpclient.UsernamePasswordCredentials;
037: import org.apache.commons.httpclient.methods.HeadMethod;
038: import org.apache.commons.httpclient.methods.PutMethod;
039: import org.jdom.Content;
040: import org.jdom.Document;
041: import org.jdom.Element;
042: import org.jdom.Text;
043: import org.jdom.output.XMLOutputter;
044:
045: /**
046: * Check for broken links.
047: *
048: * @author Ugo Cei
049: * @version $Id: LinkChecker.java 861 2006-02-09 10:08:30Z ucei $
050: */
051: public final class LinkChecker {
052:
053: public static final char COLUMN_SEPARATOR_CHAR = '\t';
054: public static final String URL_SEPARATOR_CHARS = " ";
055: public static final String I18N_NS_URI = "http://apache.org/cocoon/i18n/2.1";
056: public static final String PLACEHOLDER_TEXT_BEFORE = "<?xml version='1.0'?>\n"
057: + "<broken-links date='";
058: public static final String PLACEHOLDER_TEXT_AFTER = "'><i18n:text xmlns:i18n='"
059: + I18N_NS_URI
060: + "'>message.link-checker-running</i18n:text></broken-links>\n";
061: public static final String HTTP_ERROR_PREFIX = "message.http-error.";
062:
063: private static final HttpClient client = new HttpClient(
064: new MultiThreadedHttpConnectionManager());
065:
066: /**
067: * Check for broken links. Expects as input an array of bytes with the contents of a text file.
068: * Each line in the file should contain a page URL from the repository, followed by a tab
069: * character and a space-separated list of link URLs to be checked.
070: * <p>Only external links (i.e. those not beginning with a forward slash) will be checked.
071: * HTTP links are verified by doing a HEAD request and checking that the result code is less
072: * than 300.</p>
073: * <p>During processing a placeholder file will be placed in the repository.</p>
074: *
075: * @param repository URL of the placeholder file.
076: * @param username Username for connecting to the repository.
077: * @param password Password for connecting to the repository.
078: * @param data Input data.
079: * @return A map whose keys are the page URLs and whose values are Lists. Each item of the list
080: * is an array of two strings; first string is the link URL, second one is a message detailing
081: * why the link is considered broken.
082: * @throws IOException
083: */
084: public static Map checkLinks(String repository, String username,
085: String password, byte[] data) throws IOException {
086: put(repository, username, password, PLACEHOLDER_TEXT_BEFORE
087: + new Date() + PLACEHOLDER_TEXT_AFTER);
088: Map brokenLinks = new HashMap();
089: BufferedReader reader = null;
090: try {
091: reader = new BufferedReader(new InputStreamReader(
092: new ByteArrayInputStream(data)));
093: String line;
094: while ((line = reader.readLine()) != null) {
095: int tabPos = line.indexOf(COLUMN_SEPARATOR_CHAR);
096: if (tabPos <= 0) {
097: // FIXME: log warning?
098: continue;
099: }
100: String pageUrl = line.substring(0, tabPos);
101: StringTokenizer st = new StringTokenizer(line
102: .substring(tabPos + 1), URL_SEPARATOR_CHARS);
103: while (st.hasMoreTokens()) {
104: String link = st.nextToken();
105: // Only links starting with '/' are considered internal.
106: Content error = null;
107: if (link.charAt(0) != '/'
108: && (error = checkLink(link)) != null) {
109: List links = (List) brokenLinks.get(pageUrl);
110: if (links == null) {
111: links = new LinkedList();
112: brokenLinks.put(pageUrl, links);
113: }
114: links.add(new Object[] { link, error });
115: }
116: }
117: }
118: } finally {
119: if (reader != null) {
120: reader.close();
121: }
122: }
123: return brokenLinks;
124: }
125:
126: /**
127: * Save the results of link checking to a repository file.
128: *
129: * @param repository URL of the output file.
130: * @param username Username for connecting to the repository.
131: * @param password Password for connecting to the repository.
132: * @param pages Broken links data.
133: * @throws IOException
134: */
135: public static void putResults(String repository, String username,
136: String password, Map pages) throws IOException {
137: Element root = new Element("broken-links");
138: root.setAttribute("date", new Date().toString());
139: for (Iterator it = pages.keySet().iterator(); it.hasNext();) {
140: String page = (String) it.next();
141: Element pageEl = new Element("page");
142: pageEl.setAttribute("url", page);
143: Collection links = (Collection) pages.get(page);
144: for (Iterator it2 = links.iterator(); it2.hasNext();) {
145: Object[] link = (Object[]) it2.next();
146: Element linkEl = new Element("link");
147: linkEl.setAttribute("url", (String) link[0]);
148: linkEl.addContent((Content) link[1]);
149: pageEl.addContent(linkEl);
150: }
151: root.addContent(pageEl);
152: }
153: Document doc = new Document(root);
154: XMLOutputter outputter = new XMLOutputter();
155: StringWriter sw = new StringWriter();
156: outputter.output(doc, sw);
157: String body = sw.toString();
158: put(repository, username, password, body);
159: }
160:
161: private static void put(String repository, String username,
162: String password, String body) throws IOException {
163: client.setConnectionTimeout(30000);
164: client.getState().setAuthenticationPreemptive(true);
165: Credentials defaultcreds = new UsernamePasswordCredentials(
166: username, password);
167: client.getState().setCredentials(null, null, defaultcreds);
168: PutMethod put = null;
169: try {
170: put = new PutMethod(repository);
171: put.setRequestBody(body);
172: client.executeMethod(put);
173: // FIXME: log this!
174: } finally {
175: if (put != null) {
176: put.releaseConnection();
177: }
178: }
179:
180: }
181:
182: private static Content checkLink(String link) {
183: client.setConnectionTimeout(30000);
184: HeadMethod head = null;
185: try {
186: head = new HeadMethod(link);
187: head.setFollowRedirects(true);
188: try {
189: int resultCode = client.executeMethod(head);
190: if (resultCode >= 300) {
191: Element el = new Element("text", "i18n",
192: I18N_NS_URI);
193: el.addContent(HTTP_ERROR_PREFIX
194: + new Integer(head.getStatusCode())
195: .toString());
196: return el;
197: }
198: } catch (Exception e) {
199: return new Text(e.toString());
200: }
201: return null;
202: } finally {
203: if (head != null) {
204: head.releaseConnection();
205: }
206: }
207: }
208: }
|