0001: /* Copyright (C) 2003 Internet Archive.
0002: *
0003: * This file is part of the Heritrix web crawler (crawler.archive.org).
0004: *
0005: * Heritrix is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Lesser Public License as published by
0007: * the Free Software Foundation; either version 2.1 of the License, or
0008: * any later version.
0009: *
0010: * Heritrix is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Lesser Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Lesser Public License
0016: * along with Heritrix; if not, write to the Free Software
0017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0018: *
0019: * Created on Jul 16, 2003
0020: *
0021: */
0022: package org.archive.crawler.admin;
0023:
0024: import java.io.File;
0025: import java.io.FileWriter;
0026: import java.io.IOException;
0027: import java.io.PrintWriter;
0028: import java.io.Serializable;
0029: import java.util.Comparator;
0030: import java.util.Date;
0031: import java.util.EventObject;
0032: import java.util.Hashtable;
0033: import java.util.Iterator;
0034: import java.util.List;
0035: import java.util.Map;
0036: import java.util.HashMap;
0037: import java.util.SortedMap;
0038: import java.util.TreeMap;
0039: import java.util.TreeSet;
0040: import java.util.Vector;
0041: import java.util.logging.Level;
0042: import java.util.logging.Logger;
0043:
0044: import org.archive.crawler.datamodel.CrawlURI;
0045: import org.archive.crawler.event.CrawlURIDispositionListener;
0046: import org.archive.crawler.framework.AbstractTracker;
0047: import org.archive.crawler.framework.CrawlController;
0048: import org.archive.crawler.framework.exceptions.FatalConfigurationException;
0049: import org.archive.crawler.util.CrawledBytesHistotable;
0050: import org.archive.net.UURI;
0051: import org.archive.util.ArchiveUtils;
0052: import org.archive.util.Histotable;
0053: import org.archive.util.LongWrapper;
0054: import org.archive.util.MimetypeUtils;
0055: import org.archive.util.PaddingStringBuffer;
0056:
0057: /**
0058: * This is an implementation of the AbstractTracker. It is designed to work
0059: * with the WUI and also performs various logging activities.
0060: * <p>
0061: * At the end of each snapshot a line is written to the
0062: * 'progress-statistics.log' file.
0063: * <p>
0064: * The header of that file is as follows:
0065: * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
0066: * First there is a <b>timestamp</b>, accurate down to 1 second.
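* <p>
* A data line in that format might look like the following (hypothetical,
* illustrative values only; the timestamp is rendered by
* <code>ArchiveUtils.getLog14Date()</code>):
* <pre> [timestamp] 1200 800 400 5.00(4.50) 500(450) 3 25 10240</pre>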
0067: * <p>
0068: * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
0069: * are (respectively) the discovered URI count, pending URI count, successfully
0070: * fetched count and failed fetch count from the frontier at the time of the
0071: * snapshot.
0072: * <p>
0073: * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
0074: * to calculate average bandwidth usage (KB/sec). Since we also note the value
0075: * each time a snapshot is made we can calculate the average bandwidth usage
0076: * during the last snapshot period to get a "current" rate. The first number is
0077: * the current rate and the average is in parentheses.
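* <p>
* For illustration (hypothetical figures): if 2,048,000,000 bytes have been
* downloaded over 4,000 seconds of elapsed crawl time, the average is
* (2,048,000,000 / 1024) / 4,000 = 500 KB/sec; the "current" figure applies
* the same calculation to just the bytes and seconds elapsed since the
* previous snapshot.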
0078: * <p>
0079: * <b>doc/s(avg)</b> works the same way as KB/s(avg) except that it shows the
0080: * number of documents (URIs) downloaded rather than KB.
0081: * <p>
0082: * <b>busy-threads</b> is the total number of ToeThreads that are not available
0083: * (and thus presumably busy processing a URI). This information is extracted
0084: * from the crawl controller.
0085: * <p>
0086: * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
0087: * (<code>Runtime.getRuntime().totalMemory()</code>).
0088: * <p>
0089: * In addition to the data collected for the above logs, various other data
0090: * is gathered and stored by this tracker.
0091: * <ul>
0092: * <li> Successfully downloaded documents per fetch status code
0093: * <li> Successfully downloaded documents per document mime type
0094: * <li> Amount of data per mime type
0095: * <li> Successfully downloaded documents per host
0096: * <li> Amount of data per host
0097: * <li> Disposition of all seeds (this is written to 'reports.log' at end of
0098: * crawl)
0099: * <li> Successfully downloaded documents per host per source
0100: * </ul>
0101: *
0102: * @author Parker Thompson
0103: * @author Kristinn Sigurdsson
0104: *
0105: * @see org.archive.crawler.framework.StatisticsTracking
0106: * @see org.archive.crawler.framework.AbstractTracker
0107: */
0108: public class StatisticsTracker extends AbstractTracker implements
0109: CrawlURIDispositionListener, Serializable {
0110: private static final long serialVersionUID = 8004878315916392305L;
0111:
0112: /**
0113: * Messages from the StatisticsTracker.
0114: */
0115: private final static Logger logger = Logger
0116: .getLogger(StatisticsTracker.class.getName());
0117:
0118: // TODO: Need to be able to specify file where the object will be
0119: // written once the CrawlEnded event occurs
0120:
0121: protected long lastPagesFetchedCount = 0;
0122: protected long lastProcessedBytesCount = 0;
0123:
0124: /*
0125: * Snapshot data.
0126: */
0127: protected long discoveredUriCount = 0;
0128: protected long queuedUriCount = 0;
0129: protected long finishedUriCount = 0;
0130:
0131: protected long downloadedUriCount = 0;
0132: protected long downloadFailures = 0;
0133: protected long downloadDisregards = 0;
0134: protected double docsPerSecond = 0;
0135: protected double currentDocsPerSecond = 0;
0136: protected int currentKBPerSec = 0;
0137: protected long totalKBPerSec = 0;
0138: protected int busyThreads = 0;
0139: protected long totalProcessedBytes = 0;
0140: protected float congestionRatio = 0;
0141: protected long deepestUri;
0142: protected long averageDepth;
0143:
0144: /*
0145: * Cumulative data
0146: */
0147: /** tally sizes novel, verified (same hash), vouched (not-modified) */
0148: protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
0149:
0150: /** Keep track of the file types we see (mime type -> count) */
0151: protected Hashtable<String, LongWrapper> mimeTypeDistribution = new Hashtable<String, LongWrapper>();
0152: protected Hashtable<String, LongWrapper> mimeTypeBytes = new Hashtable<String, LongWrapper>();
0153:
0154: /** Keep track of fetch status codes */
0155: protected Hashtable<String, LongWrapper> statusCodeDistribution = new Hashtable<String, LongWrapper>();
0156:
0157: /** Keep track of hosts.
0158: *
0159: * Each of these Maps is individually unsynchronized, and cannot
0160: * be trivially synchronized with the Collections wrapper. Thus
0161: * their synchronized access is enforced by this class.
0162: *
0163: * <p>They're transient because they are usually bigmaps that get
0164: * reconstituted on recovery from a checkpoint.
0165: */
0166: protected transient Map<String, LongWrapper> hostsDistribution = null;
0167: protected transient Map<String, LongWrapper> hostsBytes = null;
0168: protected transient Map<String, Long> hostsLastFinished = null;
0169:
0170: /** Keep track of URL counts per host per seed */
0171: protected transient Map<String, HashMap<String, LongWrapper>> sourceHostDistribution = null;
0172:
0173: /**
0174: * Record of seeds' latest actions.
0175: */
0176: protected transient Map<String, SeedRecord> processedSeedsRecords;
0177:
0178: // seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN
0179: private int seedsCrawled;
0180: private int seedsNotCrawled;
0181: // sExitMessage: only set at crawl-end
0182: private String sExitMessage = "Before crawl end";
0183:
0184: public StatisticsTracker(String name) {
0185: super (
0186: name,
0187: "A statistics tracker that's integrated into "
0188: + "the web UI and that creates the progress-statistics log.");
0189: }
0190:
0191: public void initialize(CrawlController c)
0192: throws FatalConfigurationException {
0193: super .initialize(c);
0194: try {
0195: this .sourceHostDistribution = c.getBigMap(
0196: "sourceHostDistribution", String.class,
0197: HashMap.class);
0198: this .hostsDistribution = c.getBigMap("hostsDistribution",
0199: String.class, LongWrapper.class);
0200: this .hostsBytes = c.getBigMap("hostsBytes", String.class,
0201: LongWrapper.class);
0202: this .hostsLastFinished = c.getBigMap("hostsLastFinished",
0203: String.class, Long.class);
0204: this .processedSeedsRecords = c.getBigMap(
0205: "processedSeedsRecords", String.class,
0206: SeedRecord.class);
0207: } catch (Exception e) {
0208: throw new FatalConfigurationException("Failed setup of"
0209: + " StatisticsTracker: " + e);
0210: }
0211: controller.addCrawlURIDispositionListener(this );
0212: }
0213:
0214: protected void finalCleanup() {
0215: super .finalCleanup();
0216: if (this .hostsBytes != null) {
0217: this .hostsBytes.clear();
0218: this .hostsBytes = null;
0219: }
0220: if (this .hostsDistribution != null) {
0221: this .hostsDistribution.clear();
0222: this .hostsDistribution = null;
0223: }
0224: if (this .hostsLastFinished != null) {
0225: this .hostsLastFinished.clear();
0226: this .hostsLastFinished = null;
0227: }
0228: if (this .processedSeedsRecords != null) {
0229: this .processedSeedsRecords.clear();
0230: this .processedSeedsRecords = null;
0231: }
0232: if (this .sourceHostDistribution != null) {
0233: this .sourceHostDistribution.clear();
0234: this .sourceHostDistribution = null;
0235: }
0236:
0237: }
0238:
0239: protected synchronized void progressStatisticsEvent(
0240: final EventObject e) {
0241: // This method loads "snapshot" data.
0242: discoveredUriCount = discoveredUriCount();
0243: downloadedUriCount = successfullyFetchedCount();
0244: finishedUriCount = finishedUriCount();
0245: queuedUriCount = queuedUriCount();
0246: downloadFailures = failedFetchAttempts();
0247: downloadDisregards = disregardedFetchAttempts();
0248: totalProcessedBytes = totalBytesCrawled();
0249: congestionRatio = congestionRatio();
0250: deepestUri = deepestUri();
0251: averageDepth = averageDepth();
0252:
0253: if (finishedUriCount() == 0) {
0254: docsPerSecond = 0;
0255: totalKBPerSec = 0;
0256: } else if (getCrawlerTotalElapsedTime() < 1000) {
0257: return; // Not enough time has passed for a decent snapshot.
0258: } else {
0259: docsPerSecond = (double) downloadedUriCount
0260: / (double) (getCrawlerTotalElapsedTime() / 1000);
0261: // Round to nearest long.
0262: totalKBPerSec = (long) (((totalProcessedBytes / 1024) / ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
0263: }
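// Illustration of the calculations above, with hypothetical figures:
// 10,000 downloaded URIs over 2,000,000 ms of elapsed crawl time gives
// docsPerSecond = 10000 / 2000 = 5.0, and 1,024,000,000 processed bytes
// over the same period gives totalKBPerSec = (1,024,000,000 / 1024) / 2000 = 500.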
0264:
0265: busyThreads = activeThreadCount();
0266:
0267: if (shouldrun
0268: || (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
0269: // If shouldrun is false there is a chance that the time interval
0270: // since last time is too small for a good sample. We only want
0271: // to update "current" data when the interval is long enough or
0272: // shouldrun is true.
0273: currentDocsPerSecond = 0;
0274: currentKBPerSec = 0;
0275:
0276: // Note time.
0277: long currentTime = System.currentTimeMillis();
0278: long sampleTime = currentTime - lastLogPointTime;
0279:
0280: // If we haven't done anything or there isn't a reasonable sample
0281: // size, give up.
0282: if (sampleTime >= 1000) {
0283: // Update docs/sec snapshot
0284: long currentPageCount = successfullyFetchedCount();
0285: long samplePageCount = currentPageCount
0286: - lastPagesFetchedCount;
0287:
0288: currentDocsPerSecond = (double) samplePageCount
0289: / (double) (sampleTime / 1000);
0290:
0291: lastPagesFetchedCount = currentPageCount;
0292:
0293: // Update kbytes/sec snapshot
0294: long currentProcessedBytes = totalProcessedBytes;
0295: long sampleProcessedBytes = currentProcessedBytes
0296: - lastProcessedBytesCount;
0297:
0298: currentKBPerSec = (int) (((sampleProcessedBytes / 1024) / (sampleTime / 1000)) + .5);
0299:
0300: lastProcessedBytesCount = currentProcessedBytes;
0301: }
0302: }
0303:
0304: if (this .controller != null) {
0305: this .controller
0306: .logProgressStatistics(getProgressStatisticsLine());
0307: }
0308: lastLogPointTime = System.currentTimeMillis();
0309: super .progressStatisticsEvent(e);
0310: }
0311:
0312: /**
0313: * Return one line of current progress-statistics
0314: *
0315: * @param now timestamp to use for the statistics line
0316: * @return String of stats
0317: */
0318: public String getProgressStatisticsLine(Date now) {
0319: return new PaddingStringBuffer()
0320: .append(ArchiveUtils.getLog14Date(now))
0321: .raAppend(32, discoveredUriCount)
0322: .raAppend(44, queuedUriCount)
0323: .raAppend(57, downloadedUriCount)
0324: .raAppend(
0325: 74,
0326: ArchiveUtils.doubleToString(
0327: currentDocsPerSecond, 2)
0328: + "("
0329: + ArchiveUtils.doubleToString(
0330: docsPerSecond, 2) + ")")
0331: .raAppend(85,
0332: currentKBPerSec + "(" + totalKBPerSec + ")")
0333: .raAppend(99, downloadFailures)
0334: .raAppend(113, busyThreads)
0335: .raAppend(
0336: 126,
0337: (Runtime.getRuntime().totalMemory() - Runtime
0338: .getRuntime().freeMemory()) / 1024)
0339: .raAppend(140,
0340: Runtime.getRuntime().totalMemory() / 1024)
0341: .raAppend(153,
0342: ArchiveUtils.doubleToString(congestionRatio, 2))
0343: .raAppend(165, deepestUri).raAppend(177, averageDepth)
0344: .toString();
0345: }
0346:
0347: public Map<String, Number> getProgressStatistics() {
0348: Map<String, Number> stats = new HashMap<String, Number>();
0349: stats.put("discoveredUriCount", new Long(discoveredUriCount));
0350: stats.put("queuedUriCount", new Long(queuedUriCount));
0351: stats.put("downloadedUriCount", new Long(downloadedUriCount));
0352: stats.put("currentDocsPerSecond", new Double(
0353: currentDocsPerSecond));
0354: stats.put("docsPerSecond", new Double(docsPerSecond));
0355: stats.put("totalKBPerSec", new Long(totalKBPerSec));
0356: stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
0357: stats.put("currentKBPerSec", new Long(currentKBPerSec));
0358: stats.put("downloadFailures", new Long(downloadFailures));
0359: stats.put("busyThreads", new Integer(busyThreads));
0360: stats.put("congestionRatio", new Double(congestionRatio));
0361: stats.put("deepestUri", new Long(deepestUri));
0362: stats.put("averageDepth", new Long(averageDepth));
0363: stats.put("totalMemory", new Long(Runtime.getRuntime()
0364: .totalMemory()));
0365: stats.put("freeMemory", new Long(Runtime.getRuntime()
0366: .freeMemory()));
0367: return stats;
0368: }
0369:
0370: /**
0371: * Return one line of current progress-statistics
0372: *
0373: * @return String of stats
0374: */
0375: public String getProgressStatisticsLine() {
0376: return getProgressStatisticsLine(new Date());
0377: }
0378:
0379: public double processedDocsPerSec() {
0380: return docsPerSecond;
0381: }
0382:
0383: public double currentProcessedDocsPerSec() {
0384: return currentDocsPerSecond;
0385: }
0386:
0387: public long processedKBPerSec() {
0388: return totalKBPerSec;
0389: }
0390:
0391: public int currentProcessedKBPerSec() {
0392: return currentKBPerSec;
0393: }
0394:
0395: /** Returns a Hashtable that contains information about the distribution of
0396: * encountered mime types. Key/value pairs represent
0397: * mime type -> count.
0398: * <p>
0399: * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper}
0400: * @return mimeTypeDistribution
0401: */
0402: public Hashtable<String, LongWrapper> getFileDistribution() {
0403: return mimeTypeDistribution;
0404: }
0405:
0406: /**
0407: * Increment a counter for a key in a given Map. Used for various
0408: * aggregate data.
0409: *
0410: * As this is used to change Maps which depend on StatisticsTracker
0411: * for their synchronization, this method should only be invoked
0412: * from a block synchronized on 'this'.
0413: *
0414: * @param map The Map to update
0415: * @param key The key for the counter to be incremented; if it does not
0416: * exist it will be added (set to 1). If null, the counter
0417: * "unknown" will be incremented.
0418: */
0419: protected static void incrementMapCount(
0420: Map<String, LongWrapper> map, String key) {
0421: incrementMapCount(map, key, 1);
0422: }
0423:
0424: /**
0425: * Increment a counter for a key in a given Map by an arbitrary amount.
0426: * Used for various aggregate data. The increment amount can be negative.
0427: *
0428: * As this is used to change Maps which depend on StatisticsTracker
0429: * for their synchronization, this method should only be invoked
0430: * from a block synchronized on 'this'.
0431: *
0432: * @param map
0433: * The Map to update
0434: * @param key
0435: * The key for the counter to be incremented; if it does not exist
0436: * it will be added (set equal to <code>increment</code>).
0437: * If null, the counter "unknown" will be incremented.
0438: * @param increment
0439: * The amount by which to increment the counter for the <code>key</code>.
0440: */
0441: protected static void incrementMapCount(
0442: Map<String, LongWrapper> map, String key, long increment) {
0443: if (key == null) {
0444: key = "unknown";
0445: }
0446: LongWrapper lw = (LongWrapper) map.get(key);
0447: if (lw == null) {
0448: map.put(key, new LongWrapper(increment));
0449: } else {
0450: lw.longValue += increment;
0451: }
0452: }
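// Usage illustration (hypothetical values): within a block synchronized on
// this tracker, incrementMapCount(mimeTypeDistribution, "text/html") bumps
// the "text/html" tally by 1, and incrementMapCount(mimeTypeBytes,
// "text/html", 2048) adds 2048 to the byte tally for that mime type.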
0453:
0454: /**
0455: * Sort the entries of the given Map in descending order by their
0456: * values, which must be longs wrapped with <code>LongWrapper</code>.
0457: * <p>
0458: * Elements are sorted by value from largest to smallest. Equal values are
0459: * sorted in an arbitrary, but consistent manner by their keys. Only items
0460: * with identical value and key are considered equal.
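* <p>
* For example (hypothetical values), a map of {"a"=5, "b"=9, "c"=5} would be
* returned in the iteration order b, a, c.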
0461: *
0462: * If the passed-in map requires access to be synchronized, the caller
0463: * should ensure this synchronization.
0464: *
0465: * @param mapOfLongWrapperValues
0466: * Assumes values are wrapped with LongWrapper.
0467: * @return a sorted map containing the same entries as the given map.
0468: */
0469: public TreeMap<String, LongWrapper> getReverseSortedCopy(
0470: final Map<String, LongWrapper> mapOfLongWrapperValues) {
0471: TreeMap<String, LongWrapper> sortedMap = new TreeMap<String, LongWrapper>(
0472: new Comparator<String>() {
0473: public int compare(String e1, String e2) {
0474: long firstVal = mapOfLongWrapperValues.get(e1).longValue;
0475: long secondVal = mapOfLongWrapperValues.get(e2).longValue;
0476: if (firstVal < secondVal) {
0477: return 1;
0478: }
0479: if (secondVal < firstVal) {
0480: return -1;
0481: }
0482: // If the values are the same, sort by keys.
0483: return e1.compareTo(e2);
0484: }
0485: });
0486: try {
0487: sortedMap.putAll(mapOfLongWrapperValues);
0488: } catch (UnsupportedOperationException e) {
0489: Iterator<String> i = mapOfLongWrapperValues.keySet()
0490: .iterator();
0491: for (; i.hasNext();) {
0492: // Ok. Try doing it the slow way then.
0493: String key = i.next();
0494: sortedMap.put(key, mapOfLongWrapperValues.get(key));
0495: }
0496: }
0497: return sortedMap;
0498: }
0499:
0500: /**
0501: * Return a Hashtable representing the distribution of status codes for
0502: * successfully fetched CrawlURIs, where key ->
0503: * val represents (string)code -> count.
0504: *
0505: * <b>Note: </b> All the values are wrapped with a
0506: * {@link LongWrapper LongWrapper}
0507: *
0508: * @return statusCodeDistribution
0509: */
0510: public Hashtable<String, LongWrapper> getStatusCodeDistribution() {
0511: return statusCodeDistribution;
0512: }
0513:
0514: /**
0515: * Returns the time (in millisec) when a URI belonging to a given host was
0516: * last finished processing.
0517: *
0518: * @param host The host to look up time of last completed URI.
0519: * @return Returns the time (in millisec) when a URI belonging to a given
0520: * host was last finished processing. If no URI has been completed for the
0521: * host, -1 will be returned.
0522: */
0523: public long getHostLastFinished(String host) {
0524: Long l = null;
0525: synchronized (hostsLastFinished) {
0526: l = (Long) hostsLastFinished.get(host);
0527: }
0528: return (l != null) ? l.longValue() : -1;
0529: }
0530:
0531: /**
0532: * Returns the accumulated number of bytes downloaded from a given host.
0533: * @param host name of the host
0534: * @return the accumulated number of bytes downloaded from a given host
0535: */
0536: public long getBytesPerHost(String host) {
0537: synchronized (hostsBytes) {
0538: return ((LongWrapper) hostsBytes.get(host)).longValue;
0539: }
0540: }
0541:
0542: /**
0543: * Returns the accumulated number of bytes from files of a given file type.
0544: * @param filetype Filetype to check.
0545: * @return the accumulated number of bytes from files of a given mime type
0546: */
0547: public long getBytesPerFileType(String filetype) {
0548: return ((LongWrapper) mimeTypeBytes.get(filetype)).longValue;
0549: }
0550:
0551: /**
0552: * Get the total number of ToeThreads (sleeping and active)
0553: *
0554: * @return The total number of ToeThreads
0555: */
0556: public int threadCount() {
0557: return this .controller != null ? controller.getToeCount() : 0;
0558: }
0559:
0560: /**
0561: * @return Current thread count (or zero if can't figure it out).
0562: */
0563: public int activeThreadCount() {
0564: return this .controller != null ? controller.getActiveToeCount()
0565: : 0;
0566: // note: reuse of old busy value seemed misleading: anyone asking
0567: // for thread count when paused or stopped still wants accurate reading
0568: }
0569:
0570: /**
0571: * This returns the number of completed URIs as a percentage of the total
0572: * number of URIs encountered (should be inverse to the discovery curve)
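* <p>
* For example (hypothetical counts), 250 finished URIs out of 1,000
* discovered URIs yields 25.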
0573: *
0574: * @return The number of completed URIs as a percentage of the total
0575: * number of URIs encountered
0576: */
0577: public int percentOfDiscoveredUrisCompleted() {
0578: long completed = finishedUriCount();
0579: long total = discoveredUriCount();
0580:
0581: if (total == 0) {
0582: return 0;
0583: }
0584:
0585: return (int) (100 * completed / total);
0586: }
0587:
0588: /**
0589: * Number of <i>discovered</i> URIs.
0590: *
0591: * <p>If crawl not running (paused or stopped) this will return the value of
0592: * the last snapshot.
0593: *
0594: * @return A count of all uris encountered
0595: *
0596: * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
0597: */
0598: public long discoveredUriCount() {
0599: // While shouldrun is true we can use info direct from the crawler.
0600: // After that our last snapshot will have to do.
0601: return shouldrun && this .controller != null
0602: && this .controller.getFrontier() != null ? controller
0603: .getFrontier().discoveredUriCount()
0604: : discoveredUriCount;
0605: }
0606:
0607: /**
0608: * Number of URIs that have <i>finished</i> processing.
0609: *
0610: * @return Number of URIs that have finished processing
0611: *
0612: * @see org.archive.crawler.framework.Frontier#finishedUriCount()
0613: */
0614: public long finishedUriCount() {
0615: return shouldrun && this .controller != null
0616: && this .controller.getFrontier() != null ? controller
0617: .getFrontier().finishedUriCount() : finishedUriCount;
0618: }
0619:
0620: /**
0621: * Get the total number of failed fetch attempts (connection failures -> give up, etc)
0622: *
0623: * @return The total number of failed fetch attempts
0624: */
0625: public long failedFetchAttempts() {
0626: // While shouldrun is true we can use info direct from the crawler.
0627: // After that our last snapshot will have to do.
0628: return shouldrun && this .controller != null
0629: && this .controller.getFrontier() != null ? controller
0630: .getFrontier().failedFetchCount() : downloadFailures;
0631: }
0632:
0633: /**
0634: * Get the total number of disregarded fetch attempts (e.g. URIs ruled out by robots exclusions)
0635: *
0636: * @return The total number of disregarded fetch attempts
0637: */
0638: public long disregardedFetchAttempts() {
0639: // While shouldrun is true we can use info direct from the crawler.
0640: // After that our last snapshot will have to do.
0641: return shouldrun && this .controller != null
0642: && this .controller.getFrontier() != null ? controller
0643: .getFrontier().disregardedUriCount()
0644: : downloadDisregards;
0645: }
0646:
0647: public long successfullyFetchedCount() {
0648: // While shouldrun is true we can use info direct from the crawler.
0649: // After that our last snapshot will have to do.
0650: return shouldrun && this .controller != null
0651: && this .controller.getFrontier() != null ? controller
0652: .getFrontier().succeededFetchCount()
0653: : downloadedUriCount;
0654: }
0655:
0656: public long totalCount() {
0657: return queuedUriCount() + activeThreadCount()
0658: + successfullyFetchedCount();
0659: }
0660:
0661: /**
0662: * Ratio of number of threads that would theoretically allow
0663: * maximum crawl progress (if each was as productive as current
0664: * threads), to current number of threads.
0665: *
0666: * @return float congestion ratio
0667: */
0668: public float congestionRatio() {
0669: // While shouldrun is true we can use info direct from the crawler.
0670: // After that our last snapshot will have to do.
0671: return shouldrun && this .controller != null
0672: && this .controller.getFrontier() != null ? controller
0673: .getFrontier().congestionRatio() : congestionRatio;
0674: }
0675:
0676: /**
0677: * Ordinal position of the 'deepest' URI eligible
0678: * for crawling. Essentially, the length of the longest
0679: * frontier internal queue.
0680: *
0681: * @return long URI count to deepest URI
0682: */
0683: public long deepestUri() {
0684: // While shouldrun is true we can use info direct from the crawler.
0685: // After that our last snapshot will have to do.
0686: return shouldrun && this .controller != null
0687: && this .controller.getFrontier() != null ? controller
0688: .getFrontier().deepestUri() : deepestUri;
0689: }
0690:
0691: /**
0692: * Average depth of the last URI in all eligible queues.
0693: * That is, the average length of all eligible queues.
0694: *
0695: * @return long average depth of last URIs in queues
0696: */
0697: public long averageDepth() {
0698: // While shouldrun is true we can use info direct from the crawler.
0699: // After that our last snapshot will have to do.
0700: return shouldrun && this .controller != null
0701: && this .controller.getFrontier() != null ? controller
0702: .getFrontier().averageDepth() : averageDepth;
0703: }
0704:
0705: /**
0706: * Number of URIs <i>queued</i> up and waiting for processing.
0707: *
0708: * <p>If crawl not running (paused or stopped) this will return the value
0709: * of the last snapshot.
0710: *
0711: * @return Number of URIs queued up and waiting for processing.
0712: *
0713: * @see org.archive.crawler.framework.Frontier#queuedUriCount()
0714: */
0715: public long queuedUriCount() {
0716: // While shouldrun is true we can use info direct from the crawler.
0717: // After that our last snapshot will have to do.
0718: return shouldrun && this .controller != null
0719: && this .controller.getFrontier() != null ? controller
0720: .getFrontier().queuedUriCount() : queuedUriCount;
0721: }
0722:
0723: /** @deprecated use totalBytesCrawled */
0724: public long totalBytesWritten() {
0725: // return totalBytesCrawled();
0726: return shouldrun && this .controller != null
0727: && this .controller.getFrontier() != null ? controller
0728: .getFrontier().totalBytesWritten()
0729: : totalProcessedBytes;
0730: }
0731:
0732: public long totalBytesCrawled() {
0733: return shouldrun ? crawledBytes.getTotal()
0734: : totalProcessedBytes;
0735: }
0736:
0737: public String crawledBytesSummary() {
0738: return crawledBytes.summary();
0739: }
0740:
0741: /**
0742: * If the curi is a seed, we update the processedSeedsRecords map.
0743: *
0744: * @param curi The CrawlURI that may be a seed.
0745: * @param disposition The disposition of the CrawlURI.
0746: */
0747: private void handleSeed(CrawlURI curi, String disposition) {
0748: if (curi.isSeed()) {
0749: SeedRecord sr = new SeedRecord(curi, disposition);
0750: processedSeedsRecords.put(sr.getUri(), sr);
0751: }
0752: }
0753:
0754: public void crawledURISuccessful(CrawlURI curi) {
0755: handleSeed(curi, SEED_DISPOSITION_SUCCESS);
0756: // save crawled bytes tally
0757: crawledBytes.accumulate(curi);
0758:
0759: // Save status codes
0760: incrementMapCount(statusCodeDistribution, Integer.toString(curi
0761: .getFetchStatus()));
0762:
0763: // Save mime types
0764: String mime = MimetypeUtils.truncate(curi.getContentType());
0765: incrementMapCount(mimeTypeDistribution, mime);
0766: incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
0767:
0768: // Save hosts stats.
0769: saveHostStats((curi.getFetchStatus() == 1) ? "dns:"
0770: : this .controller.getServerCache().getHostFor(curi)
0771: .getHostName(), curi.getContentSize());
0772:
0773: if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
0774: saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
0775: this .controller.getServerCache().getHostFor(curi)
0776: .getHostName());
0777: }
0778: }
0779:
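// Illustration (hypothetical seed and host names): after a few successful
// fetches, sourceHostDistribution might map
// "http://example.com/seeds.txt" -> { "example.com" -> 12, "images.example.com" -> 3 },
// i.e. 12 and 3 URIs crawled on those hosts that were reached via that seed.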
0780: protected void saveSourceStats(String source, String hostname) {
0781: synchronized (sourceHostDistribution) {
0782: HashMap<String, LongWrapper> hostUriCount = sourceHostDistribution
0783: .get(source);
0784: if (hostUriCount == null) {
0785: hostUriCount = new HashMap<String, LongWrapper>();
0786: }
0787: // TODO: Dan suggests we don't need a hashtable value. Might
0788: // be faster if we went without. Could just have keys of:
0789: // seed | host (concatenated as string)
0790: // and values of:
0791: // #urls
0792: incrementMapCount(hostUriCount, hostname);
0793: sourceHostDistribution.put(source, hostUriCount);
0794: }
0795: }
0796:
0797: protected void saveHostStats(String hostname, long size) {
0798: synchronized (hostsDistribution) {
0799: incrementMapCount(hostsDistribution, hostname);
0800: }
0801: synchronized (hostsBytes) {
0802: incrementMapCount(hostsBytes, hostname, size);
0803: }
0804: synchronized (hostsLastFinished) {
0805: hostsLastFinished.put(hostname, new Long(System
0806: .currentTimeMillis()));
0807: }
0808: }
0809:
0810: public void crawledURINeedRetry(CrawlURI curi) {
0811: handleSeed(curi, SEED_DISPOSITION_RETRY);
0812: }
0813:
0814: public void crawledURIDisregard(CrawlURI curi) {
0815: handleSeed(curi, SEED_DISPOSITION_DISREGARD);
0816: }
0817:
0818: public void crawledURIFailure(CrawlURI curi) {
0819: handleSeed(curi, SEED_DISPOSITION_FAILURE);
0820: }
0821:
0822: /**
0823: * Get a seed iterator for the job being monitored.
0824: *
0825: * <b>Note:</b> This iterator will iterate over a list of <i>strings</i>, not
0826: * UURIs like the Scope seed iterator. The strings are equal to the URIs'
0827: * getURIString() values.
0828: * @return the seed iterator
0829: * FIXME: Consider using TransformingIterator here
0830: */
0831: public Iterator<String> getSeeds() {
0832: List<String> seedsCopy = new Vector<String>();
0833: Iterator<UURI> i = controller.getScope().seedsIterator();
0834: while (i.hasNext()) {
0835: seedsCopy.add(i.next().toString());
0836: }
0837: return seedsCopy.iterator();
0838: }
0839:
0840: public Iterator getSeedRecordsSortedByStatusCode() {
0841: return getSeedRecordsSortedByStatusCode(getSeeds());
0842: }
0843:
0844: protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
0845: Iterator<String> i) {
0846: TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
0847: new Comparator<SeedRecord>() {
0848: public int compare(SeedRecord sr1, SeedRecord sr2) {
0849: int code1 = sr1.getStatusCode();
0850: int code2 = sr2.getStatusCode();
0851: if (code1 == code2) {
0852: // If the values are equal, sort by URIs.
0853: return sr1.getUri().compareTo(sr2.getUri());
0854: }
0855: // Mirror and shift the number line so as to
0856: // place zero at the beginning, then all negatives
0857: // in order of ascending absolute value, then all
0858: // positives descending.
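// Illustration (hypothetical codes): 0 sorts first, then -1, -2, ...,
// then e.g. 404 before 200.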
0859: code1 = -code1 - Integer.MAX_VALUE;
0860: code2 = -code2 - Integer.MAX_VALUE;
0861:
0862: return new Integer(code1)
0863: .compareTo(new Integer(code2));
0864: }
0865: });
0866: while (i.hasNext()) {
0867: String seed = i.next();
0868: SeedRecord sr = (SeedRecord) processedSeedsRecords
0869: .get(seed);
0870: if (sr == null) {
0871: sr = new SeedRecord(seed,
0872: SEED_DISPOSITION_NOT_PROCESSED);
0873: processedSeedsRecords.put(seed, sr);
0874: }
0875: sortedSet.add(sr);
0876: }
0877: return sortedSet.iterator();
0878: }
0879:
0880: public void crawlEnded(String message) {
0881: logger.info("Entered crawlEnded");
0882: this .sExitMessage = message; // held for reference by reports
0883: super .crawlEnded(message);
0884: logger.info("Leaving crawlEnded");
0885: }
0886:
0887: /**
0888: * @param writer Where to write.
0889: */
0890: protected void writeSeedsReportTo(PrintWriter writer) {
0891: // Build header.
0892: writer.print("[code] [status] [seed] [redirect]\n");
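// A data line might look like the following (hypothetical seed):
// 200 CRAWLED http://example.com/ http://www.example.com/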
0893:
0894: seedsCrawled = 0;
0895: seedsNotCrawled = 0;
0896: for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds()); i
0897: .hasNext();) {
0898: SeedRecord sr = (SeedRecord) i.next();
0899: writer.print(sr.getStatusCode());
0900: writer.print(" ");
0901: if ((sr.getStatusCode() > 0)) {
0902: seedsCrawled++;
0903: writer.print("CRAWLED");
0904: } else {
0905: seedsNotCrawled++;
0906: writer.print("NOTCRAWLED");
0907: }
0908: writer.print(" ");
0909: writer.print(sr.getUri());
0910: if (sr.getRedirectUri() != null) {
0911: writer.print(" ");
0912: writer.print(sr.getRedirectUri());
0913: }
0914: writer.print("\n");
0915: }
0916: }
0917:
0918: protected void writeSourceReportTo(PrintWriter writer) {
0919:
0920: writer.print("[source] [host] [#urls]\n");
0921: // for each source
0922: for (Iterator i = sourceHostDistribution.keySet().iterator(); i
0923: .hasNext();) {
0924: Object sourceKey = i.next();
0925: Map<String, LongWrapper> hostCounts = (Map<String, LongWrapper>) sourceHostDistribution
0926: .get(sourceKey);
0927: // sort hosts by #urls
0928: SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
0929: // for each host
0930: for (Iterator j = sortedHostCounts.keySet().iterator(); j
0931: .hasNext();) {
0932: Object hostKey = j.next();
0933: LongWrapper hostCount = (LongWrapper) hostCounts
0934: .get(hostKey);
0935: writer.print(sourceKey.toString());
0936: writer.print(" ");
0937: writer.print(hostKey.toString());
0938: writer.print(" ");
0939: writer.print(hostCount.longValue);
0940: writer.print("\n");
0941: }
0942: }
0943: }
0944:
0945: /**
0946: * Return a copy of the given host counts map in reverse-sorted (largest
0947: * first) order.
0948: *
0949: * @return SortedMap of host counts
0950: */
0951: public SortedMap getReverseSortedHostCounts(
0952: Map<String, LongWrapper> hostCounts) {
0953: synchronized (hostCounts) {
0954: return getReverseSortedCopy(hostCounts);
0955: }
0956: }
0957:
0958: protected void writeHostsReportTo(PrintWriter writer) {
0959: SortedMap hd = getReverseSortedHostsDistribution();
0960: // header
0961: writer.print("[#urls] [#bytes] [host]\n");
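// A data line might look like the following (hypothetical host):
// 1200 34567890 example.com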
0962: for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
0963: // Key is 'host'.
0964: Object key = i.next();
0965: if (hd.get(key) != null) {
0966: writer.print(((LongWrapper) hd.get(key)).longValue);
0967: } else {
0968: writer.print("-");
0969: }
0970: writer.print(" ");
0971: writer.print(getBytesPerHost((String) key));
0972: writer.print(" ");
0973: writer.print((String) key);
0974: writer.print("\n");
0975: }
0976: }
0977:
0978: /**
0979: * Return a copy of the hosts distribution in reverse-sorted
0980: * (largest first) order.
0981: * @return SortedMap of hosts distribution
0982: */
0983: public SortedMap getReverseSortedHostsDistribution() {
0984: synchronized (hostsDistribution) {
0985: return getReverseSortedCopy(hostsDistribution);
0986: }
0987: }
0988:
0989: protected void writeMimetypesReportTo(PrintWriter writer) {
0990: // header
0991: writer.print("[#urls] [#bytes] [mime-types]\n");
0992: TreeMap fd = getReverseSortedCopy(getFileDistribution());
0993: for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
0994: Object key = i.next();
0995: // Key is mime type.
0996: writer.print(Long
0997: .toString(((LongWrapper) fd.get(key)).longValue));
0998: writer.print(" ");
0999: writer.print(Long
1000: .toString(getBytesPerFileType((String) key)));
1001: writer.print(" ");
1002: writer.print((String) key);
1003: writer.print("\n");
1004: }
1005: }
1006:
1007: protected void writeResponseCodeReportTo(PrintWriter writer) {
1008: // Build header.
1009: writer.print("[rescode] [#urls]\n");
1010: TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
1011: for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
1012: Object key = i.next();
1013: writer.print((String) key);
1014: writer.print(" ");
1015: writer.print(Long
1016: .toString(((LongWrapper) scd.get(key)).longValue));
1017: writer.print("\n");
1018: }
1019: }
1020:
1021: protected void writeCrawlReportTo(PrintWriter writer) {
1022: writer.print("Crawl Name: "
1023: + controller.getOrder().getCrawlOrderName());
1024: writer.print("\nCrawl Status: " + sExitMessage);
1025: writer
1026: .print("\nDuration Time: "
1027: + ArchiveUtils
1028: .formatMillisecondsToConventional(crawlDuration()));
1029: writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
1030: writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
1031: // hostsDistribution contains all hosts crawled plus an entry for dns.
1032: writer.print("\nTotal Hosts Crawled: "
1033: + (hostsDistribution.size() - 1));
1034: writer.print("\nTotal Documents Crawled: " + finishedUriCount);
1035: writer.print("\nProcessed docs/sec: "
1036: + ArchiveUtils.doubleToString(docsPerSecond, 2));
1037: writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
1038: writer.print("\nTotal Raw Data Size in Bytes: "
1039: + totalProcessedBytes
1040: + " ("
1041: + ArchiveUtils
1042: .formatBytesForDisplay(totalProcessedBytes)
1043: + ") \n");
1044: writer.print("Novel Bytes: "
1045: + crawledBytes.get(CrawledBytesHistotable.NOVEL)
1046: + " ("
1047: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1048: .get(CrawledBytesHistotable.NOVEL)) + ") \n");
1049: if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
1050: writer.print("Duplicate-by-hash Bytes: "
1051: + crawledBytes
1052: .get(CrawledBytesHistotable.DUPLICATE)
1053: + " ("
1054: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1055: .get(CrawledBytesHistotable.DUPLICATE))
1056: + ") \n");
1057: }
1058: if (crawledBytes
1059: .containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
1060: writer.print("Not-modified Bytes: "
1061: + crawledBytes
1062: .get(CrawledBytesHistotable.NOTMODIFIED)
1063: + " ("
1064: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1065: .get(CrawledBytesHistotable.NOTMODIFIED))
1066: + ") \n");
1067: }
1068: }
1069:
1070: protected void writeProcessorsReportTo(PrintWriter writer) {
1071: controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
1072: }
1073:
1074: protected void writeReportFile(String reportName, String filename) {
1075: File f = new File(controller.getDisk().getPath(), filename);
1076: try {
1077: PrintWriter bw = new PrintWriter(new FileWriter(f));
1078: writeReportTo(reportName, bw);
1079: bw.close();
1080: controller.addToManifest(f.getAbsolutePath(),
1081: CrawlController.MANIFEST_REPORT_FILE, true);
1082: } catch (IOException e) {
1083: logger.log(Level.SEVERE, "Unable to write "
1084: + f.getAbsolutePath() + " at the end of crawl.", e);
1085: }
1086: logger.info("wrote report: " + f.getAbsolutePath());
1087: }
1088:
1089: /**
1090: * @param writer Where to write.
1091: */
1092: protected void writeManifestReportTo(PrintWriter writer) {
1093: controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
1094: }
1095:
1096: /**
1097: * @param reportName Name of report.
1098: * @param w Where to write.
1099: */
1100: private void writeReportTo(String reportName, PrintWriter w) {
1101: if ("hosts".equals(reportName)) {
1102: writeHostsReportTo(w);
1103: } else if ("mime types".equals(reportName)) {
1104: writeMimetypesReportTo(w);
1105: } else if ("response codes".equals(reportName)) {
1106: writeResponseCodeReportTo(w);
1107: } else if ("seeds".equals(reportName)) {
1108: writeSeedsReportTo(w);
1109: } else if ("crawl".equals(reportName)) {
1110: writeCrawlReportTo(w);
1111: } else if ("processors".equals(reportName)) {
1112: writeProcessorsReportTo(w);
1113: } else if ("manifest".equals(reportName)) {
1114: writeManifestReportTo(w);
1115: } else if ("frontier".equals(reportName)) {
1116: writeFrontierReportTo(w);
1117: } else if ("source".equals(reportName)) {
1118: writeSourceReportTo(w);
1119: } // TODO: else default/error
1120: }
1121:
1122: /**
1123: * Write the Frontier's 'nonempty' report (if available)
1124: * @param writer to report to
1125: */
1126: protected void writeFrontierReportTo(PrintWriter writer) {
1127: if (controller.getFrontier().isEmpty()) {
1128: writer.println("frontier empty");
1129: } else {
1130: controller.getFrontier().reportTo("nonempty", writer);
1131: }
1132: }
1133:
1134: /**
1135: * Run the reports.
1136: */
1137: public void dumpReports() {
1138: // Add all files mentioned in the crawl order to the
1139: // manifest set.
1140: controller.addOrderToManifest();
1141: writeReportFile("hosts", "hosts-report.txt");
1142: writeReportFile("mime types", "mimetype-report.txt");
1143: writeReportFile("response codes", "responsecode-report.txt");
1144: writeReportFile("seeds", "seeds-report.txt");
1145: writeReportFile("crawl", "crawl-report.txt");
1146: writeReportFile("processors", "processors-report.txt");
1147: writeReportFile("manifest", "crawl-manifest.txt");
1148: writeReportFile("frontier", "frontier-report.txt");
1149: if (!sourceHostDistribution.isEmpty()) {
1150: writeReportFile("source", "source-report.txt");
1151: }
1152: // TODO: Save object to disk?
1153: }
1154:
1155: public void crawlCheckpoint(File cpDir) throws Exception {
1156: // CrawlController is managing the checkpointing of this object.
1157: logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
1158: }
1159: }
|