Source Code Cross Referenced for LinkChecker.java in » Web-Server » Quadcap-Web-Server » com » quadcap » http » client » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Server » Quadcap Web Server » com.quadcap.http.client

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package com.quadcap.http.client;
002:
003:        /* Copyright 1998 - 2003 Quadcap Software.  All rights reserved.
004:         *
005:         * This software is distributed under the Quadcap Free Software License.
006:         * This software may be used or modified for any purpose, personal or
007:         * commercial.  Open Source redistributions are permitted.  Commercial
008:         * redistribution of larger works derived from, or works which bundle
009:         * this software requires a "Commercial Redistribution License"; see
010:         * http://www.quadcap.com/purchase.
011:         *
012:         * Redistributions qualify as "Open Source" under  one of the following terms:
013:         *   
014:         *    Redistributions are made at no charge beyond the reasonable cost of
015:         *    materials and delivery.
016:         *
017:         *    Redistributions are accompanied by a copy of the Source Code or by an
018:         *    irrevocable offer to provide a copy of the Source Code for up to three
019:         *    years at the cost of materials and delivery.  Such redistributions
020:         *    must allow further use, modification, and redistribution of the Source
021:         *    Code under substantially the same terms as this license.
022:         *
023:         * Redistributions of source code must retain the copyright notices as they
024:         * appear in each source code file, these license terms, and the
025:         * disclaimer/limitation of liability set forth as paragraph 6 below.
026:         *
027:         * Redistributions in binary form must reproduce this Copyright Notice,
028:         * these license terms, and the disclaimer/limitation of liability set
029:         * forth as paragraph 6 below, in the documentation and/or other materials
030:         * provided with the distribution.
031:         *
032:         * The Software is provided on an "AS IS" basis.  No warranty is
033:         * provided that the Software is free of defects, or fit for a
034:         * particular purpose.  
035:         *
036:         * Limitation of Liability. Quadcap Software shall not be liable
037:         * for any damages suffered by the Licensee or any third party resulting
038:         * from use of the Software.
039:         */
040:
041:        import java.io.*;
042:
043:        import java.util.ArrayList;
044:        import java.util.Collections;
045:        import java.util.HashMap;
046:        import java.util.Iterator;
047:
048:        import org.xml.sax.AttributeList;
049:        import org.xml.sax.DocumentHandler;
050:        import org.xml.sax.DTDHandler;
051:        import org.xml.sax.EntityResolver;
052:        import org.xml.sax.ErrorHandler;
053:        import org.xml.sax.HandlerBase;
054:        import org.xml.sax.InputSource;
055:        import org.xml.sax.Locator;
056:        import org.xml.sax.SAXException;
057:
058:        import com.quadcap.text.sax.Parser;
059:
060:        import com.quadcap.http.util.HeaderParser;
061:
062:        import com.quadcap.util.collections.ArrayQueue;
063:        import com.quadcap.util.collections.DiGraph;
064:
065:        import com.quadcap.util.text.OctetMap;
066:        import com.quadcap.util.text.Scanner;
067:
068:        import com.quadcap.util.Debug;
069:        import com.quadcap.util.Util;
070:
071:        /**
072:         * This class implements a simple link checker, following links
073:         * in the following tags:
074:         *
075:         * <ul>
076:         * <li><b>&lt;A HREF=""&gt;
077:         * <li><b>&lt;IMG SRC=""&gt;
078:         * <li><b>&lt;FRAME SRC=""&gt;
079:         * </ul>
080:         */
081:        public class LinkChecker implements  DocumentHandler {
082:            /** uri of the document we're currently fetching and parsing   */
083:            String base;
084:
085:            /** base uri of the current document for relative href resolution */
086:            String urlBase;
087:
088:            /** uri of the document we're currently fetching and parsing   */
089:            String currentUrl;
090:
091:            /** directed graph of all links found so far (even bad ones...) */
092:            DiGraph links = new DiGraph();
093:
094:            /** queue of links to check */
095:            ArrayQueue linksToCheck = new ArrayQueue();
096:
097:            /** uri -> status for all links */
098:            HashMap allLinks = new HashMap();
099:
100:            /** uri -> status for completed links */
101:            HashMap linksChecked = new HashMap();
102:
103:            Parser parser;
104:            String host;
105:
106:            public LinkChecker(String url) {
107:                parser = new Parser();
108:                String s = url;
109:                if (s.startsWith("http://")) {
110:                    s = url.substring("http://".length());
111:                }
112:                int idx = s.indexOf('/');
113:                if (idx > 0)
114:                    s = s.substring(0, idx);
115:                host = "http://" + s;
116:                push(url, 0);
117:            }
118:
119:            synchronized void push(String url, int line) {
120:                if (allLinks.get(url) == null && url.startsWith(host)) {
121:                    System.out.println("PUSH " + trim(base) + " -> "
122:                            + trim(url));
123:                    if (currentUrl != null) {
124:                        links.addArc(currentUrl + ":" + line, url);
125:                    }
126:                    allLinks.put(url, "queued");
127:                    linksToCheck.push(url);
128:                }
129:            }
130:
131:            String trim(String url) {
132:                if (url != null && url.startsWith(host)) {
133:                    url = url.substring(host.length());
134:                }
135:                return url;
136:            }
137:
138:            public void printBadLinks() {
139:                ArrayList k = new ArrayList();
140:                Iterator iter = linksChecked.keySet().iterator();
141:                while (iter.hasNext()) {
142:                    String url = iter.next().toString();
143:                    String val = linksChecked.get(url).toString();
144:                    if (!val.equals("found")) {
145:                        Iterator x = links.getParents(url);
146:                        String ref = x.hasNext() ? x.next().toString() : "";
147:                        k.add(trim(ref) + "\n    error: " + trim(url));
148:                    }
149:                }
150:                Collections.sort(k);
151:                iter = k.iterator();
152:                while (iter.hasNext()) {
153:                    System.out.println(iter.next().toString());
154:                }
155:                System.out.println("--------------------\n");
156:                System.out.println("" + k.size() + " errors");
157:            }
158:
159:            public void run() throws Exception {
160:                //HtmlParser parser = new HtmlParser();
161:                int cnt = 0;
162:                while (linksToCheck.size() > 0) {
163:                    System.out.print("" + (linksChecked.size() + 1) + " of "
164:                            + (linksToCheck.size() + linksChecked.size())
165:                            + ": ");
166:                    String url = linksToCheck.popBack().toString();
167:                    if (linksChecked.get(url) != null)
168:                        continue;
169:                    System.out.println(trim(url));
170:                    currentUrl = url;
171:                    InputStream is = null;
172:                    try {
173:                        is = HttpFetcher.fetchStream(url);
174:                        Scanner scanner = new Scanner(is);
175:                        HashMap headers = new HashMap();
176:                        scanner.skipUntil(OctetMap.wsChars);
177:                        scanner.skipWhile(OctetMap.wsChars);
178:                        String resp = scanner.parseUntil(OctetMap.crlfChars);
179:                        HeaderParser.parseCRLF(scanner);
180:                        HeaderParser.parseHeaders(scanner, headers);
181:                        if (!resp.startsWith("200")) {
182:                            allLinks.put(url, "missing");
183:                            linksChecked.put(url, "missing");
184:                            Iterator iter = links.getParents(url);
185:                            String referrer = iter.hasNext() ? iter.next()
186:                                    .toString() : "---";
187:                            System.err.println("*** " + trim(url) + ","
188:                                    + trim(referrer) + "," + resp);
189:                            continue;
190:                        }
191:                        String mimeType = (String) headers.get("content-type");
192:                        if (mimeType == null || !mimeType.equals("text/html")) {
193:                            continue;
194:                        }
195:                        InputStreamReader r = new InputStreamReader(is);
196:                        InputSource in = new InputSource(r);
197:                        in.setSystemId(url);
198:                        parser.setDocumentHandler(this );
199:                        setBase(url);
200:                        parser.parse(in);
201:                        allLinks.put(url, "found");
202:                        linksChecked.put(url, "found");
203:                    } catch (IOException e) {
204:                        Debug.print(e);
205:                        allLinks.put(url, "error");
206:                        linksChecked.put(url, "error");
207:                    } catch (Exception e3) {
208:                        Debug.print(e3);
209:                        allLinks.put(url, "exception");
210:                        linksChecked.put(url, "exception");
211:                    } catch (Throwable t) {
212:                        Debug.print(t);
213:                        allLinks.put(url, "exception");
214:                        linksChecked.put(url, "exception");
215:                    } finally {
216:                        if (is != null)
217:                            is.close();
218:                        //System.out.println("Result: " + allLinks.get(url));
219:                    }
220:                }
221:            }
222:
223:            public void setBase(String base) {
224:                this .base = base;
225:                this .urlBase = parent(base);
226:                if (base.endsWith("/"))
227:                    urlBase = base;
228:            }
229:
230:            public void startDocument() {
231:            }
232:
233:            public void endDocument() {
234:            }
235:
236:            public void ignorableWhitespace(char[] ch, int off, int cnt)
237:                    throws SAXException {
238:                characters(ch, off, cnt);
239:            }
240:
241:            public void processingInstruction(String target, String data) {
242:            }
243:
244:            public void setDocumentLocator(Locator locator) {
245:            }
246:
247:            public void startElement(String tag, AttributeList attrs)
248:                    throws SAXException {
249:                try {
250:                    if (tag.equalsIgnoreCase("a")) {
251:                        String href = attrs.getValue("href");
252:                        if (href != null)
253:                            checkHref(href, parser.getLineNumber());
254:                    } else if (tag.equalsIgnoreCase("img")
255:                            || tag.equalsIgnoreCase("frame")) {
256:                        String href = attrs.getValue("src");
257:                        if (href != null)
258:                            checkHref(href, parser.getLineNumber());
259:                    }
260:                } catch (Throwable t) {
261:                    t.printStackTrace(System.err);
262:                    //System.err.println(t.toString());
263:                    System.err.println("tag = " + tag);
264:                    System.err.println("attrs = " + attrs);
265:                    System.err.println("urlBase = " + urlBase);
266:                }
267:            }
268:
269:            public void characters(char[] ch, int off, int len)
270:                    throws SAXException {
271:            }
272:
273:            public void endElement(String tag) throws SAXException {
274:            }
275:
276:            public void checkHref(String href, int line) {
277:                String tbase = urlBase;
278:                href = href.trim();
279:                if (href.length() > 0 && href.charAt(0) == '/') {
280:                    href = href.substring(1);
281:                    tbase = "";
282:                } else if (href.startsWith("http://")) {
283:                    tbase = "";
284:                } else if (href.startsWith("ftp://")
285:                        || href.startsWith("mailto:")) {
286:                    return;
287:                } else {
288:                    while (href.startsWith("./") || href.startsWith("../")) {
289:                        if (href.startsWith("./")) {
290:                            href = href.substring(2);
291:                        } else if (href.startsWith("../")) {
292:                            href = href.substring(3);
293:                            tbase = parent(tbase);
294:                        }
295:                    }
296:                }
297:                String url = tbase + href;
298:                int idx = url.indexOf('#');
299:                if (idx >= 0) {
300:                    url = url.substring(0, idx);
301:                }
302:                if (url.length() == 0)
303:                    return;
304:                push(url, line);
305:            }
306:
307:            static String parent(String s) {
308:                for (int i = s.length() - 2; i >= 0; i--) {
309:                    if (s.charAt(i) == '/')
310:                        return s.substring(0, i + 1);
311:                }
312:                throw new RuntimeException("Bad parent: " + s);
313:            }
314:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.