001: /*
002: * Copyright 2005 by Lars Torunski
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.torunski.crawler;
018:
019: import java.util.Collection;
020:
021: import org.apache.commons.logging.Log;
022: import org.apache.commons.logging.LogFactory;
023:
024: import com.torunski.crawler.core.AbstractCrawler;
025: import com.torunski.crawler.link.Link;
026: import com.torunski.crawler.parser.PageData;
027: import com.torunski.crawler.util.StopWatch;
028:
029: /**
030: * Project: Smart & Simple Web Crawler
031: *
032: * Crawls through the web with a single thread.
033: *
034: * @author Lars Torunski
035: * @version $Revision: 1.17 $
036: */
037: public class Crawler extends AbstractCrawler {
038:
039: private static final transient Log log = LogFactory
040: .getLog(Crawler.class);
041:
042: private StopWatch total = new StopWatch();
043: private StopWatch loading = new StopWatch();
044: private StopWatch parsing = new StopWatch();
045: private StopWatch listener = new StopWatch();
046:
047: /**
048: * Constructor for Crawler.
049: */
050: public Crawler() {
051: }
052:
053: /**
054: * Starts the crawling process in a single thread.
055: */
056: public void start(String server, String start) {
057:
058: // set the default parser
059: if (parser == null) {
060: log
061: .debug("No parser set, defautling to SimpleHttpClientParser.");
062: parser = new com.torunski.crawler.parser.httpclient.SimpleHttpClientParser();
063: }
064:
065: // set default crawler model
066: if (model == null) {
067: log
068: .debug("No model set, defautling to MaxIterationsModel.");
069: model = new com.torunski.crawler.model.MaxIterationsModel();
070: }
071:
072: // initialize stop watch
073: total.reset();
074: loading.reset();
075: parsing.reset();
076: listener.reset();
077:
078: total.start();
079:
080: // add at least one link to the list
081: model.add(null, server + start);
082:
083: // starts the crawling process
084: start();
085:
086: total.stop();
087:
088: // output some statistics
089: if (log.isInfoEnabled()) {
090:
091: Collection visitedURIs = model.getVisitedURIs();
092: Collection toVisitURIs = model.getToVisitURIs();
093:
094: log.info("Visited URIs: " + visitedURIs.size());
095:
096: if (toVisitURIs.size() > 0) {
097: log.warn("still URIs to be visited, at least: "
098: + toVisitURIs.size());
099: }
100:
101: // output stop watch data
102: log.info("Total time: " + total.getTime() + " ms");
103: log.info("- loading: " + loading.getTime() + " ms");
104: log.info("- parsing: " + parsing.getTime() + " ms");
105: log.info("- listener: " + listener.getTime() + " ms");
106: }
107: }
108:
109: /**
110: * Starts the crawling process in a single thread.
111: * @see com.torunski.crawler.core.ICrawler#start()
112: */
113: public void start() {
114: // loop until there aren't any URIs anymore
115: while (!model.isEmpty()) {
116:
117: // remove a link from the stack
118: Link link = model.pop();
119:
120: // load the page
121: loading.start();
122: PageData pageData = parser.load(link);
123: loading.stop();
124:
125: if (pageData.getStatus() == PageData.OK) {
126: // get the links in the page
127: parsing.start();
128: Collection newURIs = parser.parse(pageData, linkFilter);
129: parsing.stop();
130:
131: listener.start();
132: fireParserEvent(link, pageData, newURIs);
133: listener.stop();
134:
135: // remove already visited URIs from the new URI list
136: newURIs.removeAll(model.getVisitedURIs());
137:
138: // the rest of the URIs can be visited
139: model.add(link, newURIs);
140: }
141: }
142: }
143:
144: }
|