/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.framework;

import java.util.Iterator;
import java.util.Map;

import org.archive.crawler.framework.exceptions.FatalConfigurationException;

/**
 * An interface for objects that want to collect statistics on
 * running crawls. An implementation of this is referenced in the
 * crawl order and loaded when the crawl begins.
 *
 * <p>It will be given a reference to the relevant CrawlController.
 * The CrawlController will contain any additional configuration
 * information needed.
 *
 * <p>Any class that implements this interface can be specified as a
 * statistics tracker in a crawl order. The CrawlController will
 * then create and initialize a copy of it and call its start()
 * method.
 *
 * <p>This interface also specifies several methods to access data that
 * the CrawlController or the URIFrontier may be interested in at
 * run time but does not want to keep track of itself.
 * {@link org.archive.crawler.framework.AbstractTracker AbstractTracker}
 * implements these. If more than one StatisticsTracking class is
 * defined in the crawl order, only the first one will be used to
 * access this data.
 *
 * <p>It is recommended that implementations register for
 * {@link org.archive.crawler.event.CrawlStatusListener CrawlStatus} events and
 * {@link org.archive.crawler.event.CrawlURIDispositionListener CrawlURIDisposition}
 * events to be able to properly monitor a crawl. Both are registered with the
 * CrawlController.
 *
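 * <p>A minimal implementation sketch (the class name is hypothetical, and the
 * listener-registration method shown is an assumption, not something this
 * interface defines):
 * <pre>
 * public class MyStatisticsTracker implements StatisticsTracking,
 *         CrawlStatusListener {
 *     private CrawlController controller;
 *
 *     public void initialize(CrawlController c)
 *             throws FatalConfigurationException {
 *         this.controller = c;
 *         // Register for crawl status events with the controller
 *         // (registration method name assumed).
 *         c.addCrawlStatusListener(this);
 *     }
 *     // run() plus the remaining statistics accessors and
 *     // CrawlStatusListener callbacks are omitted for brevity.
 * }
 * </pre>
 *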
 * @author Kristinn Sigurdsson
 *
 * @see AbstractTracker
 * @see org.archive.crawler.event.CrawlStatusListener
 * @see org.archive.crawler.event.CrawlURIDispositionListener
 * @see org.archive.crawler.framework.CrawlController
 */
public interface StatisticsTracking extends Runnable {
    /** Seed successfully crawled */
    public static final String SEED_DISPOSITION_SUCCESS = "Seed successfully crawled";
    /** Failed to crawl seed */
    public static final String SEED_DISPOSITION_FAILURE = "Failed to crawl seed";
    /** Failed to crawl seed, will retry */
    public static final String SEED_DISPOSITION_RETRY = "Failed to crawl seed, will retry";
    /** Seed was disregarded */
    public static final String SEED_DISPOSITION_DISREGARD = "Seed was disregarded";
    /** Seed has not been processed */
    public static final String SEED_DISPOSITION_NOT_PROCESSED = "Seed has not been processed";

    /**
     * Do initialization.
     *
     * The CrawlController will call this method before calling the start()
     * method.
     *
     * @param c The {@link CrawlController CrawlController} running the crawl
     * that this class is to gather statistics on.
     * @throws FatalConfigurationException
     */
    public void initialize(CrawlController c)
        throws FatalConfigurationException;

    /**
     * Returns how long the current crawl has been running, excluding any
     * time spent paused/suspended/stopped.
     *
     * @return The length of time - in msec - that this crawl has been running.
     */
    public long crawlDuration();

    /**
     * Start the tracker's crawl timing.
     */
    public void noteStart();

    /**
     * Returns the total number of uncompressed bytes processed. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes written to disk
     * @deprecated misnomer; use totalBytesCrawled instead
     */
    public long totalBytesWritten();

    /**
     * Returns the total number of uncompressed bytes crawled. Stored
     * data may be much smaller due to compression or duplicate-reduction
     * policies.
     *
     * @return The total number of uncompressed bytes crawled
     */
    public long totalBytesCrawled();

    /**
     * Total amount of time spent actively crawling so far.
     * <p>Returns the total amount of time (in milliseconds) that has elapsed
     * from the start of the crawl until the current time (or, if the crawl
     * has ended, until the end of the crawl), <b>minus</b> any time spent
     * paused.
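     * <p>Conceptually (the field names below are illustrative only, not part
     * of this interface):
     * <pre>
     * long end = (crawlEndTime == 0) ? System.currentTimeMillis() : crawlEndTime;
     * long elapsed = end - crawlStartTime - totalPausedTime;
     * </pre>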
     * @return Total amount of time (in msec.) spent crawling so far.
     */
    public long getCrawlerTotalElapsedTime();

    /**
     * Returns an estimate of recent document download rates
     * based on a queue of recently seen CrawlURIs (as of last snapshot).
     *
     * @return The rate per second of documents gathered during the last
     * snapshot
     */
    public double currentProcessedDocsPerSec();

    /**
     * Returns the number of documents that have been processed
     * per second over the life of the crawl (as of last snapshot).
     *
     * @return The rate per second of documents gathered so far
     */
    public double processedDocsPerSec();

    /**
     * Calculates the rate, in KB per second, at which data has been
     * processed over the life of the crawl (as of last snapshot).
     *
     * @return The rate per second of KB gathered so far
     */
    public long processedKBPerSec();

    /**
     * Calculates an estimate of the rate, in KB per second, at which data
     * is currently being processed by the crawler (as of last snapshot).
     * For more accurate estimates, set a larger queue size, or collect and
     * average multiple values.
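     *
     * <p>Caller-side sketch of collecting and averaging several snapshot
     * readings (the tracker reference, sample count, and polling interval
     * are illustrative, not prescribed by this interface):
     * <pre>
     * long sum = 0;
     * int taken = 0;
     * for (int i = 5; i > 0; i--) {
     *     sum += tracker.currentProcessedKBPerSec();
     *     taken++;
     *     try {
     *         Thread.sleep(20000); // wait roughly one snapshot interval
     *     } catch (InterruptedException e) {
     *         Thread.currentThread().interrupt();
     *         break;
     *     }
     * }
     * long averageKBPerSec = (taken > 0) ? sum / taken : 0;
     * </pre>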
     *
     * @return The rate per second of KB gathered during the last snapshot
     */
    public int currentProcessedKBPerSec();

    /**
     * Get the number of active (non-paused) threads.
     *
     * @return The number of active (non-paused) threads
     */
    public int activeThreadCount();

    /**
     * Number of <i>successfully</i> processed URIs.
     *
     * <p>If the crawl is not running (paused or stopped), this will return
     * the value of the last snapshot.
     *
     * @return The number of successfully fetched URIs
     *
     * @see org.archive.crawler.framework.Frontier#succeededFetchCount()
     */
    public long successfullyFetchedCount();

    /**
     * @return Total number of URIs (processed + queued +
     * currently being processed)
     */
    public long totalCount();

    /**
     * @return The frontier's congestion ratio.
     */
    public float congestionRatio();

    /**
     * @return Depth of the deepest frontier queue.
     */
    public long deepestUri();

    /**
     * @return Average depth of the frontier's queues.
     */
    public long averageDepth();

    /**
     * Get a SeedRecord iterator for the job being monitored. If the job is
     * no longer running, stored values will be returned. If the job is
     * running, the current seed iterator will be fetched and the stored
     * values will be updated.
     * <p>
     * Sort order is:<br>
     * No status code (not processed)<br>
     * Status codes smaller than 0 (largest to smallest)<br>
     * Status codes larger than 0 (largest to smallest)<br>
     * <p>
     * <b>Note:</b> This iterator will iterate over a list of
     * <i>SeedRecords</i>.
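     * <p>Caller-side sketch of walking the returned iterator (the raw
     * Iterator elements are cast to SeedRecord, per the note above; the
     * variable names are illustrative):
     * <pre>
     * Iterator it = tracker.getSeedRecordsSortedByStatusCode();
     * while (it.hasNext()) {
     *     SeedRecord record = (SeedRecord) it.next();
     *     // inspect the seed's disposition, status code, etc.
     * }
     * </pre>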
     * @return the seed iterator
     */
    public Iterator getSeedRecordsSortedByStatusCode();

    /**
     * @return legend of progress-statistics
     */
    public String progressStatisticsLegend();

    /**
     * @return line of progress-statistics
     */
    public String getProgressStatisticsLine();

    /**
     * @return Map of progress-statistics.
     */
    public Map getProgressStatistics();
}