package org.archive.crawler.util;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.util.Accumulator;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;

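/**
 * Histotable which accumulates the content sizes of crawled URIs into
 * three buckets: novel content, content whose digest is identical to a
 * prior capture, and 304-not-modified responses.
 */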
public class CrawledBytesHistotable extends Histotable<String>
implements Accumulator<CrawlURI>, CoreAttributeConstants {
    private static final long serialVersionUID = 7923431123239026213L;

    public static final String NOTMODIFIED = "not-modified";
    public static final String DUPLICATE = "dup-by-hash";
    public static final String NOVEL = "novel";

    public CrawledBytesHistotable() {
        super();
        // preload the NOVEL bucket so it always appears in the summary
        tally(NOVEL, 0);
    }

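    /**
     * Tally the content size of the given CrawlURI into the appropriate
     * bucket: NOTMODIFIED for 304 responses, DUPLICATE when the fetched
     * digest matches a previous capture, NOVEL otherwise.
     */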
    public void accumulate(CrawlURI curi) {
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
            tally(NOTMODIFIED, curi.getContentSize());
        } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
            tally(DUPLICATE, curi.getContentSize());
        } else {
            tally(NOVEL, curi.getContentSize());
        }
    }

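    /**
     * Render a one-line human-readable summary of total crawled bytes,
     * with the duplicate and not-modified breakdowns included only when
     * those buckets have been tallied.
     */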
    public String summary() {
        StringBuilder sb = new StringBuilder();
        sb.append(ArchiveUtils.formatBytesForDisplay(getTotal()));
        sb.append(" crawled (");
        sb.append(ArchiveUtils.formatBytesForDisplay(get(NOVEL)));
        sb.append(" novel");
        if (get(DUPLICATE) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(DUPLICATE)));
            sb.append(" ");
            sb.append(DUPLICATE);
        }
        if (get(NOTMODIFIED) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(NOTMODIFIED)));
            sb.append(" ");
            sb.append(NOTMODIFIED);
        }
        sb.append(")");
        return sb.toString();
    }
}