01: /*
02: * Project: ExampleServer
03: *
04: * $Id: ExampleServer.java,v 1.2 2006/08/26 10:33:11 ltorunski Exp $
05: */
06: package com.torunski.crawler.examples;
07:
08: import java.util.Collection;
09: import java.util.Iterator;
10:
11: import com.torunski.crawler.Crawler;
12: import com.torunski.crawler.filter.ServerFilter;
13:
14: /**
15: * Example for a simple crawling process.
16: *
17: * Description: Command line example to crawl a web site starting from root. It uses a "ServerFilter" and the default "Max Iterations" model with a maximum of 32 links.
 * Result: Using www.spiegel.de as the parameter, 32 pages are visited and more than 400 pages are left to visit.
19: *
20: * @author Lars Torunski
21: * @version $Id: ExampleServer.java,v 1.2 2006/08/26 10:33:11 ltorunski Exp $
22: */
23: public class ExampleServer {
24:
25: public static void main(String[] args) {
26:
27: if (args.length != 1) {
28: System.out.println("ExampleServer for Crawler");
29: System.out
30: .println("Usage: java com.torunski.crawler.examples.ExampleServer [http server]");
31: return;
32: }
33:
34: Crawler crawler = new Crawler();
35: crawler.setLinkFilter(new ServerFilter(args[0]));
36: crawler.start(args[0], "/");
37:
38: // show visited links
39: Collection visitedLinks = crawler.getModel().getVisitedURIs();
40: System.out.println("Links visited=" + visitedLinks.size());
41:
42: Iterator list = visitedLinks.iterator();
43: while (list.hasNext()) {
44: System.out.println(list.next());
45: }
46:
47: // show visited links
48: Collection notVisitedLinks = crawler.getModel()
49: .getToVisitURIs();
50:
51: System.out.println("Links NOT visited="
52: + notVisitedLinks.size());
53: Iterator listNot = notVisitedLinks.iterator();
54: while (listNot.hasNext()) {
55: System.out.println(listNot.next());
56: }
57: }
58:
59: }
|