001: package com.quadcap.http.client;
002:
003: /* Copyright 1998 - 2003 Quadcap Software. All rights reserved.
004: *
005: * This software is distributed under the Quadcap Free Software License.
006: * This software may be used or modified for any purpose, personal or
007: * commercial. Open Source redistributions are permitted. Commercial
008: * redistribution of larger works derived from, or works which bundle
009: * this software requires a "Commercial Redistribution License"; see
010: * http://www.quadcap.com/purchase.
011: *
012: * Redistributions qualify as "Open Source" under one of the following terms:
013: *
014: * Redistributions are made at no charge beyond the reasonable cost of
015: * materials and delivery.
016: *
017: * Redistributions are accompanied by a copy of the Source Code or by an
018: * irrevocable offer to provide a copy of the Source Code for up to three
019: * years at the cost of materials and delivery. Such redistributions
020: * must allow further use, modification, and redistribution of the Source
021: * Code under substantially the same terms as this license.
022: *
023: * Redistributions of source code must retain the copyright notices as they
024: * appear in each source code file, these license terms, and the
025: * disclaimer/limitation of liability set forth as paragraph 6 below.
026: *
027: * Redistributions in binary form must reproduce this Copyright Notice,
028: * these license terms, and the disclaimer/limitation of liability set
029: * forth as paragraph 6 below, in the documentation and/or other materials
030: * provided with the distribution.
031: *
032: * The Software is provided on an "AS IS" basis. No warranty is
033: * provided that the Software is free of defects, or fit for a
034: * particular purpose.
035: *
036: * Limitation of Liability. Quadcap Software shall not be liable
037: * for any damages suffered by the Licensee or any third party resulting
038: * from use of the Software.
039: */
040:
041: import java.io.*;
042:
043: import java.util.ArrayList;
044: import java.util.Collections;
045: import java.util.HashMap;
046: import java.util.Iterator;
047:
048: import org.xml.sax.AttributeList;
049: import org.xml.sax.DocumentHandler;
050: import org.xml.sax.DTDHandler;
051: import org.xml.sax.EntityResolver;
052: import org.xml.sax.ErrorHandler;
053: import org.xml.sax.HandlerBase;
054: import org.xml.sax.InputSource;
055: import org.xml.sax.Locator;
056: import org.xml.sax.SAXException;
057:
058: import com.quadcap.text.sax.Parser;
059:
060: import com.quadcap.http.util.HeaderParser;
061:
062: import com.quadcap.util.collections.ArrayQueue;
063: import com.quadcap.util.collections.DiGraph;
064:
065: import com.quadcap.util.text.OctetMap;
066: import com.quadcap.util.text.Scanner;
067:
068: import com.quadcap.util.Debug;
069: import com.quadcap.util.Util;
070:
071: /**
072: * This class implements a simple link checker, following links
073: * in the following tags:
074: *
075: * <ul>
076: * <li><b><A HREF="">
077: * <li><b><IMG SRC="">
078: * <li><b><FRAME SRC="">
079: * </ul>
080: */
081: public class LinkChecker implements DocumentHandler {
082: /** uri of the document we're currently fetching and parsing */
083: String base;
084:
085: /** base uri of the current document for relative href resolution */
086: String urlBase;
087:
088: /** uri of the document we're currently fetching and parsing */
089: String currentUrl;
090:
091: /** directed graph of all links found so far (even bad ones...) */
092: DiGraph links = new DiGraph();
093:
094: /** queue of links to check */
095: ArrayQueue linksToCheck = new ArrayQueue();
096:
097: /** uri -> status for all links */
098: HashMap allLinks = new HashMap();
099:
100: /** uri -> status for completed links */
101: HashMap linksChecked = new HashMap();
102:
103: Parser parser;
104: String host;
105:
106: public LinkChecker(String url) {
107: parser = new Parser();
108: String s = url;
109: if (s.startsWith("http://")) {
110: s = url.substring("http://".length());
111: }
112: int idx = s.indexOf('/');
113: if (idx > 0)
114: s = s.substring(0, idx);
115: host = "http://" + s;
116: push(url, 0);
117: }
118:
119: synchronized void push(String url, int line) {
120: if (allLinks.get(url) == null && url.startsWith(host)) {
121: System.out.println("PUSH " + trim(base) + " -> "
122: + trim(url));
123: if (currentUrl != null) {
124: links.addArc(currentUrl + ":" + line, url);
125: }
126: allLinks.put(url, "queued");
127: linksToCheck.push(url);
128: }
129: }
130:
131: String trim(String url) {
132: if (url != null && url.startsWith(host)) {
133: url = url.substring(host.length());
134: }
135: return url;
136: }
137:
138: public void printBadLinks() {
139: ArrayList k = new ArrayList();
140: Iterator iter = linksChecked.keySet().iterator();
141: while (iter.hasNext()) {
142: String url = iter.next().toString();
143: String val = linksChecked.get(url).toString();
144: if (!val.equals("found")) {
145: Iterator x = links.getParents(url);
146: String ref = x.hasNext() ? x.next().toString() : "";
147: k.add(trim(ref) + "\n error: " + trim(url));
148: }
149: }
150: Collections.sort(k);
151: iter = k.iterator();
152: while (iter.hasNext()) {
153: System.out.println(iter.next().toString());
154: }
155: System.out.println("--------------------\n");
156: System.out.println("" + k.size() + " errors");
157: }
158:
159: public void run() throws Exception {
160: //HtmlParser parser = new HtmlParser();
161: int cnt = 0;
162: while (linksToCheck.size() > 0) {
163: System.out.print("" + (linksChecked.size() + 1) + " of "
164: + (linksToCheck.size() + linksChecked.size())
165: + ": ");
166: String url = linksToCheck.popBack().toString();
167: if (linksChecked.get(url) != null)
168: continue;
169: System.out.println(trim(url));
170: currentUrl = url;
171: InputStream is = null;
172: try {
173: is = HttpFetcher.fetchStream(url);
174: Scanner scanner = new Scanner(is);
175: HashMap headers = new HashMap();
176: scanner.skipUntil(OctetMap.wsChars);
177: scanner.skipWhile(OctetMap.wsChars);
178: String resp = scanner.parseUntil(OctetMap.crlfChars);
179: HeaderParser.parseCRLF(scanner);
180: HeaderParser.parseHeaders(scanner, headers);
181: if (!resp.startsWith("200")) {
182: allLinks.put(url, "missing");
183: linksChecked.put(url, "missing");
184: Iterator iter = links.getParents(url);
185: String referrer = iter.hasNext() ? iter.next()
186: .toString() : "---";
187: System.err.println("*** " + trim(url) + ","
188: + trim(referrer) + "," + resp);
189: continue;
190: }
191: String mimeType = (String) headers.get("content-type");
192: if (mimeType == null || !mimeType.equals("text/html")) {
193: continue;
194: }
195: InputStreamReader r = new InputStreamReader(is);
196: InputSource in = new InputSource(r);
197: in.setSystemId(url);
198: parser.setDocumentHandler(this );
199: setBase(url);
200: parser.parse(in);
201: allLinks.put(url, "found");
202: linksChecked.put(url, "found");
203: } catch (IOException e) {
204: Debug.print(e);
205: allLinks.put(url, "error");
206: linksChecked.put(url, "error");
207: } catch (Exception e3) {
208: Debug.print(e3);
209: allLinks.put(url, "exception");
210: linksChecked.put(url, "exception");
211: } catch (Throwable t) {
212: Debug.print(t);
213: allLinks.put(url, "exception");
214: linksChecked.put(url, "exception");
215: } finally {
216: if (is != null)
217: is.close();
218: //System.out.println("Result: " + allLinks.get(url));
219: }
220: }
221: }
222:
223: public void setBase(String base) {
224: this .base = base;
225: this .urlBase = parent(base);
226: if (base.endsWith("/"))
227: urlBase = base;
228: }
229:
230: public void startDocument() {
231: }
232:
233: public void endDocument() {
234: }
235:
236: public void ignorableWhitespace(char[] ch, int off, int cnt)
237: throws SAXException {
238: characters(ch, off, cnt);
239: }
240:
241: public void processingInstruction(String target, String data) {
242: }
243:
244: public void setDocumentLocator(Locator locator) {
245: }
246:
247: public void startElement(String tag, AttributeList attrs)
248: throws SAXException {
249: try {
250: if (tag.equalsIgnoreCase("a")) {
251: String href = attrs.getValue("href");
252: if (href != null)
253: checkHref(href, parser.getLineNumber());
254: } else if (tag.equalsIgnoreCase("img")
255: || tag.equalsIgnoreCase("frame")) {
256: String href = attrs.getValue("src");
257: if (href != null)
258: checkHref(href, parser.getLineNumber());
259: }
260: } catch (Throwable t) {
261: t.printStackTrace(System.err);
262: //System.err.println(t.toString());
263: System.err.println("tag = " + tag);
264: System.err.println("attrs = " + attrs);
265: System.err.println("urlBase = " + urlBase);
266: }
267: }
268:
269: public void characters(char[] ch, int off, int len)
270: throws SAXException {
271: }
272:
273: public void endElement(String tag) throws SAXException {
274: }
275:
276: public void checkHref(String href, int line) {
277: String tbase = urlBase;
278: href = href.trim();
279: if (href.length() > 0 && href.charAt(0) == '/') {
280: href = href.substring(1);
281: tbase = "";
282: } else if (href.startsWith("http://")) {
283: tbase = "";
284: } else if (href.startsWith("ftp://")
285: || href.startsWith("mailto:")) {
286: return;
287: } else {
288: while (href.startsWith("./") || href.startsWith("../")) {
289: if (href.startsWith("./")) {
290: href = href.substring(2);
291: } else if (href.startsWith("../")) {
292: href = href.substring(3);
293: tbase = parent(tbase);
294: }
295: }
296: }
297: String url = tbase + href;
298: int idx = url.indexOf('#');
299: if (idx >= 0) {
300: url = url.substring(0, idx);
301: }
302: if (url.length() == 0)
303: return;
304: push(url, line);
305: }
306:
307: static String parent(String s) {
308: for (int i = s.length() - 2; i >= 0; i--) {
309: if (s.charAt(i) == '/')
310: return s.substring(0, i + 1);
311: }
312: throw new RuntimeException("Bad parent: " + s);
313: }
314: }
|