001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: */
019: package org.archive.crawler.framework;
020:
021: import java.io.Serializable;
022: import java.util.Date;
023: import java.util.EventObject;
024: import java.util.logging.Level;
025:
026: import javax.management.AttributeNotFoundException;
027:
028: import org.archive.crawler.event.CrawlStatusListener;
029: import org.archive.crawler.framework.exceptions.FatalConfigurationException;
030: import org.archive.crawler.settings.ModuleType;
031: import org.archive.crawler.settings.SimpleType;
032: import org.archive.crawler.settings.Type;
033: import org.archive.util.ArchiveUtils;
034: import org.archive.util.PaddingStringBuffer;
035: import org.xbill.DNS.DClass;
036: import org.xbill.DNS.Lookup;
037:
038: /**
039: * A partial implementation of the StatisticsTracking interface.
040: * <p>
041: * It covers the thread handling. (Launching, pausing etc.) Included in this is
042: * keeping track of the total time spent (actually) crawling. Several methods
043: * to access the time started, finished etc. are provided.
044: * <p>
045: * To handle the thread work the class implements the CrawlStatusListener and
* uses its events to pause, resume and stop logging of statistics. The run()
047: * method will call logActivity() at intervals specified in the crawl order.
048: * <p>
049: * Implementation of logActivity (the actual logging) as well as listening for
050: * CrawlURIDisposition events is not addressed.
051: *
052: * @author Kristinn Sigurdsson
053: *
054: * @see org.archive.crawler.framework.StatisticsTracking
055: * @see org.archive.crawler.admin.StatisticsTracker
056: */
057: public abstract class AbstractTracker extends ModuleType implements
058: StatisticsTracking, CrawlStatusListener, Serializable {
059: /** Default period between logging stat values */
060: public static final Integer DEFAULT_STATISTICS_REPORT_INTERVAL = new Integer(
061: 20);
062: /** Attribute name for logging interval in seconds setting
063: */
064: public static final String ATTR_STATS_INTERVAL = "interval-seconds";
065:
066: /** A reference to the CrawlContoller of the crawl that we are to track
067: * statistics for.
068: */
069: protected transient CrawlController controller;
070:
071: // Keep track of time.
072: protected long crawlerStartTime;
073: protected long crawlerEndTime = -1; // Until crawl ends, this value is -1.
074: protected long crawlerPauseStarted = 0;
075: protected long crawlerTotalPausedTime = 0;
076:
077: /** Timestamp of when this logger last wrote something to the log */
078: protected long lastLogPointTime;
079:
080: protected boolean shouldrun = true;
081:
082: /**
083: * @param name
084: * @param description
085: */
086: public AbstractTracker(String name, String description) {
087: super (name, description);
088: Type e = addElementToDefinition(new SimpleType(
089: ATTR_STATS_INTERVAL,
090: "The interval between writing progress information to log.",
091: DEFAULT_STATISTICS_REPORT_INTERVAL));
092: e.setOverrideable(false);
093: }
094:
095: /**
096: * Sets up the Logger (including logInterval) and registers with the
097: * CrawlController for CrawlStatus and CrawlURIDisposition events.
098: *
099: * @param c A crawl controller instance.
100: * @throws FatalConfigurationException Not thrown here. For overrides that
101: * go to settings system for configuration.
102: * @see CrawlStatusListener
103: * @see org.archive.crawler.event.CrawlURIDispositionListener
104: */
105: public void initialize(CrawlController c)
106: throws FatalConfigurationException {
107: this .controller = c;
108:
109: // Add listeners
110: this .controller.addCrawlStatusListener(this );
111: }
112:
113: /**
114: * Start thread. Will call logActivity() at intervals specified by
115: * logInterval
116: *
117: */
118: public void run() {
119: // Don't start logging if we have no logger
120: if (this .controller == null) {
121: return;
122: }
123:
124: shouldrun = true; //If we are starting, this should always be true.
125:
126: // Log the legend
127: this .controller
128: .logProgressStatistics(progressStatisticsLegend());
129: lastLogPointTime = System.currentTimeMillis(); // The first interval begins now.
130:
131: // Keep logging until someone calls stop()
132: while (shouldrun) {
133: // Pause before writing the first entry (so we have real numbers)
134: // and then pause between entries
135: try {
136: Thread.sleep(getLogWriteInterval() * 1000);
137: } catch (InterruptedException e) {
138: e.printStackTrace();
139: controller.runtimeErrors
140: .log(Level.INFO,
141: "Periodic stat logger interrupted while sleeping.");
142: }
143:
144: // In case stop() was invoked while the thread was sleeping or we
145: // are paused.
146: if (shouldrun && getCrawlPauseStartedTime() == 0) {
147: progressStatisticsEvent(new EventObject(this ));
148: }
149: }
150: }
151:
152: /**
153: * @return legend for progress-statistics lines/log
154: */
155: public String progressStatisticsLegend() {
156: return " timestamp"
157: + " discovered "
158: + " queued downloaded doc/s(avg) KB/s(avg) "
159: + " dl-failures busy-thread mem-use-KB heap-size-KB "
160: + " congestion max-depth avg-depth";
161: }
162:
163: /**
164: * Notify tracker that crawl has begun. Must be called
165: * outside tracker's own thread, to ensure it is noted
166: * before other threads start interacting with tracker.
167: */
168: public void noteStart() {
169: if (this .crawlerStartTime == 0) {
170: // Note the time the crawl starts (only if not already set)
171: this .crawlerStartTime = System.currentTimeMillis();
172: }
173: }
174:
175: /**
176: * A method for logging current crawler state.
177: *
178: * This method will be called by run() at intervals specified in
179: * the crawl order file. It is also invoked when pausing or
180: * stopping a crawl to capture the state at that point. Default behavior is
181: * call to {@link CrawlController#logProgressStatistics} so CrawlController
182: * can act on progress statistics event.
183: * <p>
184: * It is recommended that for implementations of this method it be
185: * carefully considered if it should be synchronized in whole or in
186: * part
187: * @param e Progress statistics event.
188: */
189: protected synchronized void progressStatisticsEvent(
190: final EventObject e) {
191: this .controller.progressStatisticsEvent(e);
192: // temporary workaround for
193: // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
194: // http://sourceforge.net/support/tracker.php?aid=996161
195: Lookup.getDefaultCache(DClass.IN).clearCache();
196: }
197:
198: /**
199: * Get the starting time of the crawl (as given by
200: * <code>System.currentTimeMillis()</code> when the crawl started).
201: * @return time fo the crawl's start
202: */
203: public long getCrawlStartTime() {
204: return this .crawlerStartTime;
205: }
206:
207: /**
208: * If crawl has ended it will return the time it ended (given by
209: * <code>System.currentTimeMillis()</code> at that time).
210: * <br>
211: * If crawl is still going on it will return the same as
212: * <code>System.currentTimeMillis()</code> at the time of the call.
213: * @return The time of the crawl ending or the current time if the crawl has
214: * not ended.
215: */
216: public long getCrawlEndTime() {
217: return (this .crawlerEndTime == -1) ? System.currentTimeMillis()
218: : this .crawlerEndTime;
219: }
220:
221: /**
222: * Returns the number of milliseconds that the crawl spent paused or
223: * otherwise in a nonactive state.
224: * @return the number of msec. that the crawl was paused or otherwise
225: * suspended.
226: */
227: public long getCrawlTotalPauseTime() {
228: return this .crawlerTotalPausedTime;
229: }
230:
231: /**
232: * Get the time when the the crawl was last paused/suspended (as given by
233: * <code>System.currentTimeMillis()</code> at that time). Will be 0 if the
234: * crawl is not currently paused.
235: * @return time of the crawl's last pause/suspend or 0 if the crawl is not
236: * currently paused.
237: */
238: public long getCrawlPauseStartedTime() {
239: return this .crawlerPauseStarted;
240: }
241:
242: public long getCrawlerTotalElapsedTime() {
243: if (getCrawlStartTime() == 0) {
244: // if no start time set yet, consider elapsed time zero
245: return 0;
246: }
247:
248: return (getCrawlPauseStartedTime() != 0) ?
249: // Are currently paused, calculate time up to last pause
250: (getCrawlPauseStartedTime() - getCrawlTotalPauseTime() - getCrawlStartTime())
251: :
252: // Not paused, calculate total time.
253: (getCrawlEndTime() - getCrawlTotalPauseTime() - getCrawlStartTime());
254: }
255:
256: /**
257: * The number of seconds to wait between writing snapshot data to log file.
258: * @return the number of seconds to wait between writing snapshot data to
259: * log file.
260: */
261: protected int getLogWriteInterval() {
262: int logInterval;
263: try {
264: logInterval = ((Integer) getAttribute(null,
265: ATTR_STATS_INTERVAL)).intValue();
266: } catch (AttributeNotFoundException e) {
267: logInterval = 10;
268: }
269: return logInterval;
270: }
271:
272: /**
273: * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
274: */
275: public void crawlPausing(String statusMessage) {
276: logNote("CRAWL WAITING - " + statusMessage);
277: }
278:
279: protected void logNote(final String note) {
280: this .controller.logProgressStatistics(new PaddingStringBuffer()
281: .append(ArchiveUtils.get14DigitDate()).append(" ")
282: .append(note).toString());
283: }
284:
285: public void crawlPaused(String statusMessage) {
286: crawlerPauseStarted = System.currentTimeMillis();
287: progressStatisticsEvent(new EventObject(this ));
288: logNote("CRAWL PAUSED - " + statusMessage);
289: }
290:
291: public void crawlResuming(String statusMessage) {
292: tallyCurrentPause();
293: logNote("CRAWL RESUMED - " + statusMessage);
294: lastLogPointTime = System.currentTimeMillis();
295: }
296:
297: /**
298: * For a current pause (if any), add paused time to total and reset
299: */
300: protected void tallyCurrentPause() {
301: if (this .crawlerPauseStarted > 0) {
302: // Ok, we managed to actually pause before resuming.
303: this .crawlerTotalPausedTime += (System.currentTimeMillis() - this .crawlerPauseStarted);
304: }
305: this .crawlerPauseStarted = 0;
306: }
307:
308: public void crawlEnding(String sExitMessage) {
309: logNote("CRAWL ENDING - " + sExitMessage);
310: }
311:
312: /**
313: * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
314: */
315: public void crawlEnded(String sExitMessage) {
316: // Note the time when the crawl stops.
317: crawlerEndTime = System.currentTimeMillis();
318: progressStatisticsEvent(new EventObject(this ));
319: logNote("CRAWL ENDED - " + sExitMessage);
320: shouldrun = false;
321: dumpReports();
322: finalCleanup();
323: }
324:
325: public void crawlStarted(String message) {
326: tallyCurrentPause();
327: noteStart();
328: }
329:
330: /**
331: * Dump reports, if any, on request or at crawl end.
332: */
333: protected void dumpReports() {
334: // by default do nothing; subclasses may override
335: }
336:
337: /**
338: * Cleanup resources used, at crawl end.
339: */
340: protected void finalCleanup() {
341: controller = null; // Facilitate GC.
342: }
343:
344: /**
345: * @see org.archive.crawler.framework.StatisticsTracking#crawlDuration()
346: */
347: public long crawlDuration() {
348: return getCrawlerTotalElapsedTime();
349: }
350: }
|