/* StatisticsSummary
 *
 * $Id: StatisticsSummary.java 4666 2006-09-26 17:53:28Z paul_jack $
 *
 * Created on July 27, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.util.LongWrapper;

/**
 * This class provides descriptive statistics of a finished crawl job by
 * using the crawl report files generated by StatisticsTracker. Any formatting
 * changes to the way StatisticsTracker writes to the summary crawl reports
 * will require changes to this class.
 * <p>
 * The following statistics are accessible from this class:
 * <ul>
 * <li> Successfully downloaded documents per fetch status code
 * <li> Successfully downloaded documents per document mime type
 * <li> Amount of data per mime type
 * <li> Successfully downloaded documents per host
 * <li> Amount of data per host
 * <li> Successfully downloaded documents per top-level domain name (TLD)
 * <li> Disposition of all seeds
 * <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
 * OOME.
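 *
 * <p>Illustrative usage (a sketch only; assumes a completed {@link CrawlJob}
 * named <code>job</code> whose report files exist on disk):
 * <pre>
 * StatisticsSummary summary = new StatisticsSummary(job);
 * if (summary.isStats()) {
 *     long httpDocs = summary.getTotalStatusCodeDocuments();
 * }
 * </pre>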
 *
 * @author Frank McCown
 *
 * @see org.archive.crawler.admin.StatisticsTracker
 */
public class StatisticsSummary {
    /**
     * Messages from the StatisticsSummary.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsSummary.class.getName());

    private boolean stats = true;

    /** Crawl job whose summary we want to view */
    private CrawlJob cjob;

    protected long totalDnsStatusCodeDocuments = 0;
    protected long totalStatusCodeDocuments = 0;
    protected long totalFileTypeDocuments = 0;
    protected long totalMimeTypeDocuments = 0;
    protected long totalDnsMimeTypeDocuments = 0;
    protected long totalDnsHostDocuments = 0;
    protected long totalHostDocuments = 0;
    protected long totalMimeSize = 0;
    protected long totalDnsMimeSize = 0;
    protected long totalHostSize = 0;
    protected long totalDnsHostSize = 0;
    protected long totalTldDocuments = 0;
    protected long totalTldSize = 0;
    protected long totalHosts = 0;

    protected String durationTime;
    protected String processedDocsPerSec;
    protected String bandwidthKbytesPerSec;
    protected String totalDataWritten;

    /** Keep track of the file types we see (mime type -> count) */
    protected Hashtable<String, LongWrapper> mimeTypeDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> mimeTypeBytes =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> mimeTypeDnsDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> mimeTypeDnsBytes =
        new Hashtable<String, LongWrapper>();

    /** Keep track of status codes */
    protected Hashtable<String, LongWrapper> statusCodeDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> dnsStatusCodeDistribution =
        new Hashtable<String, LongWrapper>();

    /** Keep track of hosts */
    protected Hashtable<String, LongWrapper> hostsDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> hostsBytes =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> hostsDnsDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> hostsDnsBytes =
        new Hashtable<String, LongWrapper>();

    /** Keep track of TLDs */
    protected Hashtable<String, LongWrapper> tldDistribution =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> tldBytes =
        new Hashtable<String, LongWrapper>();
    protected Hashtable<String, LongWrapper> tldHostDistribution =
        new Hashtable<String, LongWrapper>();

    /** Keep track of processed seeds */
    protected transient Map<String, SeedRecord> processedSeedsRecords =
        new Hashtable<String, SeedRecord>();

    /**
     * Constructor
     *
     * @param cjob
     *            Completed crawl job
     */
    public StatisticsSummary(CrawlJob cjob) {
        this.cjob = cjob;

        // Read all stats for this crawl job
        this.stats = calculateStatusCodeDistribution();
        if (calculateMimeTypeDistribution()) {
            this.stats = true;
        }
        if (calculateHostsDistribution()) {
            this.stats = true;
        }
        if (readCrawlReport()) {
            this.stats = true;
        }
        if (readSeedReport()) {
            this.stats = true;
        }
    }

    /**
     * Increment a counter for a key in a given Map. Used for various
     * aggregate data.
     *
     * @param map The Map
     * @param key The key for the counter to be incremented; if it does not
     *            exist it will be added (set to 1). If null, the counter
     *            "unknown" will be incremented.
     */
    protected static void incrementMapCount(
            Map<String, LongWrapper> map, String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given Map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
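     *
     * <p>For example (illustrative; <code>tldBytes</code> is one of the
     * tallies kept by this class):
     * <pre>
     * incrementMapCount(tldBytes, "com", 7468);
     * </pre>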
     *
     * @param map
     *            The Map
     * @param key
     *            The key for the counter to be incremented; if it does not
     *            exist it will be added (set equal to
     *            <code>increment</code>).
     *            If null, the counter "unknown" will be incremented.
     * @param increment
     *            The amount to increment the counter related to the
     *            <code>key</code>.
     */
    protected static void incrementMapCount(
            Map<String, LongWrapper> map, String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        LongWrapper lw = map.get(key);
        if (lw == null) {
            map.put(key, new LongWrapper(increment));
        } else {
            lw.longValue += increment;
        }
    }

    /**
     * Returns a Hashtable that contains information about distributions of
     * encountered mime types. Key/value pairs represent
     * mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
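     *
     * <p>For example, to read a single count (the <code>summary</code>
     * instance name is assumed):
     * <pre>
     * LongWrapper count =
     *     (LongWrapper) summary.getMimeDistribution().get("text/html");
     * long docs = (count == null) ? 0 : count.longValue;
     * </pre>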
     * @return mimeTypeDistribution
     */
    public Hashtable getMimeDistribution() {
        return mimeTypeDistribution;
    }

    public long getTotalMimeTypeDocuments() {
        return totalMimeTypeDocuments;
    }

    public long getTotalDnsMimeTypeDocuments() {
        return totalDnsMimeTypeDocuments;
    }

    public long getTotalMimeSize() {
        return totalMimeSize;
    }

    public long getTotalDnsMimeSize() {
        return totalDnsMimeSize;
    }

    /**
     * Return a Hashtable representing the distribution of HTTP status codes
     * for successfully fetched curis, as represented by a hashtable where
     * key -> val represents (string)code -> (integer)count.
     *
     * <b>Note:</b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return statusCodeDistribution
     */
    public Hashtable getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Return a Hashtable representing the distribution of DNS status codes
     * for successfully fetched curis, as represented by a hashtable where
     * key -> val represents (string)code -> (integer)count.
     *
     * <b>Note:</b> All the values are wrapped with a
     * {@link LongWrapper LongWrapper}.
     *
     * @return dnsStatusCodeDistribution
     */
    public Hashtable getDnsStatusCodeDistribution() {
        return dnsStatusCodeDistribution;
    }

    public Hashtable getDnsMimeDistribution() {
        return mimeTypeDnsDistribution;
    }

    public long getTotalDnsStatusCodeDocuments() {
        return totalDnsStatusCodeDocuments;
    }

    public long getTotalStatusCodeDocuments() {
        return totalStatusCodeDocuments;
    }

    public long getTotalHostDocuments() {
        return totalHostDocuments;
    }

    public long getTotalDnsHostDocuments() {
        return totalDnsHostDocuments;
    }

    public Hashtable getHostsDnsDistribution() {
        return hostsDnsDistribution;
    }

    public long getTotalHostDnsDocuments() {
        return totalDnsHostDocuments;
    }

    public long getTotalHostSize() {
        return totalHostSize;
    }

    public long getTotalDnsHostSize() {
        return totalDnsHostSize;
    }

    public Hashtable getTldDistribution() {
        return tldDistribution;
    }

    public Hashtable getTldBytes() {
        return tldBytes;
    }

    public long getTotalTldDocuments() {
        return totalTldDocuments;
    }

    public long getTotalTldSize() {
        return totalTldSize;
    }

    public Hashtable getTldHostDistribution() {
        return tldHostDistribution;
    }

    public long getTotalHosts() {
        return totalHosts;
    }

    public String getDurationTime() {
        return durationTime;
    }

    public String getProcessedDocsPerSec() {
        return processedDocsPerSec;
    }

    public String getBandwidthKbytesPerSec() {
        return bandwidthKbytesPerSec;
    }

    public String getTotalDataWritten() {
        return totalDataWritten;
    }

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>LongWrapper</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
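     *
     * <p>Illustrative usage (values chosen only for the example):
     * <pre>
     * Map&lt;String, LongWrapper&gt; counts =
     *     new Hashtable&lt;String, LongWrapper&gt;();
     * counts.put("text/html", new LongWrapper(12));
     * counts.put("image/gif", new LongWrapper(40));
     * // Resulting iteration order: image/gif (40), then text/html (12).
     * TreeMap&lt;String, LongWrapper&gt; sorted = getReverseSortedCopy(counts);
     * </pre>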
     *
     * @param mapOfLongWrapperValues
     *            Assumes values are wrapped with LongWrapper.
     * @return a sorted map containing the same entries as the given map.
     */
    public TreeMap<String, LongWrapper> getReverseSortedCopy(
            final Map<String, LongWrapper> mapOfLongWrapperValues) {
        TreeMap<String, LongWrapper> sortedMap =
            new TreeMap<String, LongWrapper>(new Comparator<String>() {
                public int compare(String e1, String e2) {
                    long firstVal = mapOfLongWrapperValues.get(e1).longValue;
                    long secondVal = mapOfLongWrapperValues.get(e2).longValue;
                    if (firstVal < secondVal) {
                        return 1;
                    }
                    if (secondVal < firstVal) {
                        return -1;
                    }
                    // If the values are the same, sort by keys.
                    return e1.compareTo(e2);
                }
            });
        try {
            sortedMap.putAll(mapOfLongWrapperValues);
        } catch (UnsupportedOperationException e) {
            for (String key : mapOfLongWrapperValues.keySet()) {
                sortedMap.put(key, mapOfLongWrapperValues.get(key));
            }
        }
        return sortedMap;
    }

    /**
     * Get the number of hosts with a particular TLD.
     * @param tld
     *            top-level domain name
     * @return Total crawled hosts
     */
    public long getHostsPerTld(String tld) {
        LongWrapper lw = (LongWrapper) tldHostDistribution.get(tld);
        return (lw == null ? 0 : lw.longValue);
    }

    /**
     * Read status code distribution from responsecode-report.txt.
     * DNS and HTTP status codes are separated when read.
     * @return True if we found some stats.
     */
    private boolean calculateStatusCodeDistribution() {
        // Read from responsecode-report.txt
        File f = new File(cjob.getDirectory(), "responsecode-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);
            String line = br.readLine(); // Ignore heading
            line = br.readLine();
            while (line != null) {
                // Get status code and # urls, which are separated by a space.
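                // Example (illustrative values): "404 1240" for an HTTP code,
                // "1 52" for a DNS code (DNS codes are shorter than 3 chars).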

                String[] items = line.split(" ");
                if (items.length < 2) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                } else {
                    // See if DNS or HTTP status code
                    if (items[0].length() < 3) {
                        // DNS status code
                        long total = Long.parseLong(items[1]);
                        dnsStatusCodeDistribution.put(items[0],
                            new LongWrapper(total));
                        totalDnsStatusCodeDocuments += total;
                    } else {
                        // HTTP status code
                        long total = Long.parseLong(items[1]);
                        statusCodeDistribution.put(items[0],
                            new LongWrapper(total));
                        totalStatusCodeDocuments += total;
                    }
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "Unable to read " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Read MIME type data from mimetype-report.txt.
     * MIME type of text/dns is separated from other MIME types.
     * @return True if we found some stats.
     */
    private boolean calculateMimeTypeDistribution() {
        File f = new File(cjob.getDirectory(), "mimetype-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);
            String line = br.readLine(); // Ignore heading
            line = br.readLine();
            while (line != null) {
                // Get num urls, num bytes, and MIME type (separated by a space)
                // Example: 12 134279 text/html

                String[] items = line.split(" ");
                if (items.length < 3) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                } else {
                    long total = Long.parseLong(items[0]);
                    long bytes = Long.parseLong(items[1]);
                    String mime = items[2];

                    // Separate DNS records from HTTP
                    if (mime.equalsIgnoreCase("text/dns")) {
                        mimeTypeDnsDistribution.put(mime,
                            new LongWrapper(total));
                        mimeTypeDnsBytes.put(mime, new LongWrapper(bytes));
                        totalDnsMimeTypeDocuments += total;
                        totalDnsMimeSize += bytes;
                    } else {
                        mimeTypeDistribution.put(mime, new LongWrapper(total));
                        mimeTypeBytes.put(mime, new LongWrapper(bytes));
                        totalMimeTypeDocuments += total;
                        totalMimeSize += bytes;
                    }
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Read number of URLs and total bytes for each host name from
     * hosts-report.txt.
     * Host name of "dns:" is separated from others.
     * @return true if stats found.
     */
    private boolean calculateHostsDistribution() {
        File f = new File(cjob.getDirectory(), "hosts-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);
            String line = br.readLine(); // Ignore heading
            line = br.readLine();
            while (line != null) {
                // Get num urls, num bytes, and host name (separated by a space)
                // Example: 9 7468 www.blogger.com

                String[] items = line.split(" ");
                if (items.length < 3) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                } else {
                    long total = Long.parseLong(items[0]);
                    long bytes = Long.parseLong(items[1]);
                    String host = items[2];

                    // Separate DNS records from HTTP
                    if (host.startsWith("dns:", 0)) {
                        hostsDnsDistribution.put(host, new LongWrapper(total));
                        hostsDnsBytes.put(host, new LongWrapper(bytes));
                        totalDnsHostDocuments += total;
                        totalDnsHostSize += bytes;
                    } else {
                        hostsDistribution.put(host, new LongWrapper(total));
                        hostsBytes.put(host, new LongWrapper(bytes));
                        totalHostDocuments += total;
                        totalHostSize += bytes;

                        // Count top level domain (TLD)
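                        // e.g. "www.blogger.com" -> "com"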
                        String tld =
                            host.substring(host.lastIndexOf('.') + 1);
                        incrementMapCount(tldDistribution, tld, total);
                        incrementMapCount(tldBytes, tld, bytes);
                        incrementMapCount(tldHostDistribution, tld);
                        totalTldDocuments += total;
                        totalTldSize += bytes;

                        totalHosts++;
                    }
                }
                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from the given host;
     *         0 if the host was not seen, or -1 if <code>host</code> is null
     */
    public long getBytesPerHost(String host) {
        if (host == null) {
            return -1;
        }
        LongWrapper lw = host.startsWith("dns:", 0)
            ? hostsDnsBytes.get(host)
            : hostsBytes.get(host);
        return (lw == null ? 0 : lw.longValue);
    }

    /**
     * Returns the total number of bytes downloaded for a given TLD.
     * @param tld TLD
     * @return the total number of bytes downloaded for a given TLD
     */
    public long getBytesPerTld(String tld) {
        LongWrapper lw = (LongWrapper) tldBytes.get(tld);
        return (lw == null ? 0 : lw.longValue);
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerMimeType(String filetype) {
        long bytes = -1;

        if (filetype != null) {
            if (filetype.equals("text/dns")) {
                bytes = mimeTypeDnsBytes.get(filetype) == null ? 0
                    : ((LongWrapper) mimeTypeDnsBytes.get(filetype)).longValue;
            } else {
                bytes = mimeTypeBytes.get(filetype) == null ? 0
                    : ((LongWrapper) mimeTypeBytes.get(filetype)).longValue;
            }
        }
        return bytes;
    }

    /**
     * Reads duration time, processed docs/sec, bandwidth, and total size
     * of crawl from crawl-report.txt.
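     * <p>Lines in the report are expected to resemble the following
     * (values shown are illustrative only):
     * <pre>
     * Duration Time: 2h23m10s52ms
     * Processed docs/sec: 8.52
     * Bandwidth in Kbytes/sec: 117
     * Total Raw Data Size in Bytes: 1073741824
     * </pre>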
     * @return true if stats found.
     */
    public boolean readCrawlReport() {
        File f = new File(cjob.getDirectory(), "crawl-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);
            String line = br.readLine();
            while (line != null) {
                if (line.startsWith("Duration Time")) {
                    durationTime = line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Processed docs/sec")) {
                    processedDocsPerSec =
                        line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Bandwidth in Kbytes/sec")) {
                    bandwidthKbytesPerSec =
                        line.substring(line.indexOf(':') + 1);
                } else if (line.startsWith("Total Raw Data Size in Bytes")) {
                    totalDataWritten = line.substring(line.indexOf(':') + 1);
                }

                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Failed close of " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Returns a sorted Iterator of seed records based on status code.
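     * <p>Illustrative iteration (the <code>summary</code> instance name is
     * assumed):
     * <pre>
     * Iterator&lt;SeedRecord&gt; i = summary.getSeedRecordsSortedByStatusCode();
     * while (i.hasNext()) {
     *     SeedRecord sr = i.next();
     * }
     * </pre>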
     * @return sorted Iterator of seed records
     */
    public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
        TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
            new Comparator<SeedRecord>() {
                public int compare(SeedRecord sr1, SeedRecord sr2) {
                    int code1 = sr1.getStatusCode();
                    int code2 = sr2.getStatusCode();
                    if (code1 == code2) {
                        // If the values are equal, sort by URIs.
                        return sr1.getUri().compareTo(sr2.getUri());
                    }
                    // Mirror and shift the number line so as to
                    // place zero at the beginning, then all negatives
                    // in order of ascending absolute value, then all
                    // positives descending.
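                    // (Positive codes wrap around via int overflow, which
                    // yields an order like: 0, -1, -2, ..., 503, 404, 200.)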
                    code1 = -code1 - Integer.MAX_VALUE;
                    code2 = -code2 - Integer.MAX_VALUE;

                    return new Integer(code1).compareTo(new Integer(code2));
                }
            });
        for (SeedRecord sr : processedSeedsRecords.values()) {
            sortedSet.add(sr);
        }

        return sortedSet.iterator();
    }

    /**
     * Reads seed data from seeds-report.txt.
     * @return True if stats found.
     */
    private boolean readSeedReport() {
        File f = new File(cjob.getDirectory(), "seeds-report.txt");
        if (!f.exists()) {
            return false;
        }
        BufferedReader br = null;
        try {
            FileReader reader = new FileReader(f);
            br = new BufferedReader(reader);

            // Ignore heading: [code] [status] [seed] [redirect]
            String line = br.readLine();
            line = br.readLine();
            while (line != null) {
                // Example lines:
                // 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
                // 200 CRAWLED http://noleeo.com/

                String[] items = line.split(" ");

                if (items.length < 3) {
                    logger.log(Level.WARNING,
                        "Unexpected formatting on line [" + line + "]");
                } else {
                    String statusCode = items[0];
                    String crawlStatus = items[1];
                    String seed = items[2];
                    String redirect = items.length > 3 ? items[3] : null;

                    // All values should be CRAWLED or NOTCRAWLED
                    if (crawlStatus.equals("CRAWLED")) {
                        crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
                    } else {
                        crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
                    }
                    SeedRecord sr = new SeedRecord(seed, crawlStatus,
                        Integer.parseInt(statusCode), redirect);
                    processedSeedsRecords.put(seed, sr);
                }

                line = br.readLine();
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE,
                        "Closing " + f.getAbsolutePath(), e);
                }
            }
        }
        return true;
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     *
     * @return SortedMap of hosts distribution
     */
    public SortedMap getReverseSortedHostsDistribution() {
        return getReverseSortedCopy(hostsDistribution);
    }

    /**
     * @return True if we compiled stats, false if none to compile (e.g.
     *         there are no reports files on disk).
     */
    public boolean isStats() {
        return this.stats;
    }
}
|