Source Code Cross Referenced for StatisticsSummary.java in  » Web-Crawler » heritrix » org » archive » crawler » admin » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.admin 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* StatisticsSummary
002:         * 
003:         * $Id: StatisticsSummary.java 4666 2006-09-26 17:53:28Z paul_jack $$
004:         * 
005:         * Created on July 27, 2006
006:         * 
007:         * Copyright (C) 2006 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.crawler.admin;
026:
027:        import java.io.File;
028:        import java.io.FileReader;
029:        import java.io.BufferedReader;
030:        import java.io.IOException;
031:        import java.util.Comparator;
032:        import java.util.Hashtable;
033:        import java.util.Iterator;
034:        import java.util.Map;
035:        import java.util.SortedMap;
036:        import java.util.TreeMap;
037:        import java.util.TreeSet;
038:        import java.util.logging.Level;
039:        import java.util.logging.Logger;
040:
041:        import org.archive.util.LongWrapper;
042:
043:        /**
044:         * This class provides descriptive statistics of a finished crawl job by
045:         * using the crawl report files generated by StatisticsTracker.  Any formatting
046:         * changes to the way StatisticsTracker writes to the summary crawl reports will
047:         * require changes to this class.
048:         * <p>
049:         * The following statistics are accessible from this class:
050:         * <ul>
051:         *   <li> Successfully downloaded documents per fetch status code
052:         *   <li> Successfully downloaded documents per document mime type
053:         *   <li> Amount of data per mime type
054:         *   <li> Successfully downloaded documents per host
055:         *   <li> Amount of data per host
056:         *   <li> Successfully downloaded documents per top-level domain name (TLD)
057:         *   <li> Disposition of all seeds 
058:         *   <li> Successfully downloaded documents per host per source
059:         * </ul>
060:         *
061:         * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
062:         * OOME.
063:         *
064:         * @author Frank McCown
065:         *
066:         * @see org.archive.crawler.admin.StatisticsTracker
067:         */
068:        public class StatisticsSummary {
069:            /**
070:             * Messages from the StatisticsSummary.
071:             */
072:            private final static Logger logger = Logger
073:                    .getLogger(StatisticsSummary.class.getName());
074:
075:            private boolean stats = true;
076:
077:            /** Crawl job whose summary we want to view */
078:            private CrawlJob cjob;
079:
080:            protected long totalDnsStatusCodeDocuments = 0;
081:            protected long totalStatusCodeDocuments = 0;
082:            protected long totalFileTypeDocuments = 0;
083:            protected long totalMimeTypeDocuments = 0;
084:            protected long totalDnsMimeTypeDocuments = 0;
085:            protected long totalDnsHostDocuments = 0;
086:            protected long totalHostDocuments = 0;
087:            protected long totalMimeSize = 0;
088:            protected long totalDnsMimeSize = 0;
089:            protected long totalHostSize = 0;
090:            protected long totalDnsHostSize = 0;
091:            protected long totalTldDocuments = 0;
092:            protected long totalTldSize = 0;
093:            protected long totalHosts = 0;
094:
095:            protected String durationTime;
096:            protected String processedDocsPerSec;
097:            protected String bandwidthKbytesPerSec;
098:            protected String totalDataWritten;
099:
100:            /** Keep track of the file types we see (mime type -> count) */
101:            protected Hashtable<String, LongWrapper> mimeTypeDistribution = new Hashtable<String, LongWrapper>();
102:            protected Hashtable<String, LongWrapper> mimeTypeBytes = new Hashtable<String, LongWrapper>();
103:            protected Hashtable<String, LongWrapper> mimeTypeDnsDistribution = new Hashtable<String, LongWrapper>();
104:            protected Hashtable<String, LongWrapper> mimeTypeDnsBytes = new Hashtable<String, LongWrapper>();
105:
106:            /** Keep track of status codes */
107:            protected Hashtable<String, LongWrapper> statusCodeDistribution = new Hashtable<String, LongWrapper>();
108:            protected Hashtable<String, LongWrapper> dnsStatusCodeDistribution = new Hashtable<String, LongWrapper>();
109:
110:            /** Keep track of hosts */
111:            protected Hashtable<String, LongWrapper> hostsDistribution = new Hashtable<String, LongWrapper>();
112:            protected Hashtable<String, LongWrapper> hostsBytes = new Hashtable<String, LongWrapper>();
113:            protected Hashtable<String, LongWrapper> hostsDnsDistribution = new Hashtable<String, LongWrapper>();
114:            protected Hashtable<String, LongWrapper> hostsDnsBytes = new Hashtable<String, LongWrapper>();
115:
116:            /** Keep track of TLDs */
117:            protected Hashtable<String, LongWrapper> tldDistribution = new Hashtable<String, LongWrapper>();
118:            protected Hashtable<String, LongWrapper> tldBytes = new Hashtable<String, LongWrapper>();
119:            protected Hashtable<String, LongWrapper> tldHostDistribution = new Hashtable<String, LongWrapper>();
120:
121:            /** Keep track of processed seeds */
122:            protected transient Map<String, SeedRecord> processedSeedsRecords = new Hashtable<String, SeedRecord>();
123:
/**
 * Builds the summary of a crawl job by running every report reader
 * against the job's directory.
 *
 * @param cjob
 *            Completed crawl job
 */
public StatisticsSummary(CrawlJob cjob) {
    this.cjob = cjob;

    // Read all stats for this crawl job. Each reader is a separate
    // statement, so all of them run regardless of earlier results; the
    // flag ends up true when at least one reader reported stats.
    this.stats = calculateStatusCodeDistribution();
    this.stats |= calculateMimeTypeDistribution();
    this.stats |= calculateHostsDistribution();
    this.stats |= readCrawlReport();
    this.stats |= readSeedReport();
}
148:
149:            /**
150:             * Increment a counter for a key in a given HashMap. Used for various
151:             * aggregate data.
152:             *
153:             * @param map The HashMap
154:             * @param key The key for the counter to be incremented, if it does not
155:             *               exist it will be added (set to 1).  If null it will
156:             *            increment the counter "unknown".
157:             */
158:            protected static void incrementMapCount(
159:                    Map<String, LongWrapper> map, String key) {
160:                incrementMapCount(map, key, 1);
161:            }
162:
163:            /**
164:             * Increment a counter for a key in a given HashMap by an arbitrary amount.
165:             * Used for various aggregate data. The increment amount can be negative.
166:             *
167:             * @param map
168:             *            The HashMap
169:             * @param key
170:             *            The key for the counter to be incremented, if it does not
171:             *            exist it will be added (set to equal to
172:             *            <code>increment</code>).
173:             *            If null it will increment the counter "unknown".
174:             * @param increment
175:             *            The amount to increment counter related to the
176:             *            <code>key</code>.
177:             */
178:            protected static void incrementMapCount(
179:                    Map<String, LongWrapper> map, String key, long increment) {
180:                if (key == null) {
181:                    key = "unknown";
182:                }
183:                LongWrapper lw = map.get(key);
184:                if (lw == null) {
185:                    map.put(key, new LongWrapper(increment));
186:                } else {
187:                    lw.longValue += increment;
188:                }
189:            }
190:
191:            /** Returns a HashMap that contains information about distributions of
192:             *  encountered mime types.  Key/value pairs represent
193:             *  mime type -> count.
194:             * <p>
195:             * <b>Note:</b> All the values are wrapped with a
196:             * {@link LongWrapper LongWrapper}
197:             * @return mimeTypeDistribution
198:             */
199:            public Hashtable getMimeDistribution() {
200:                return mimeTypeDistribution;
201:            }
202:
203:            public long getTotalMimeTypeDocuments() {
204:                return totalMimeTypeDocuments;
205:            }
206:
207:            public long getTotalDnsMimeTypeDocuments() {
208:                return totalDnsMimeTypeDocuments;
209:            }
210:
211:            public long getTotalMimeSize() {
212:                return totalMimeSize;
213:            }
214:
215:            public long getTotalDnsMimeSize() {
216:                return totalDnsMimeSize;
217:            }
218:
219:            /**
220:             * Return a HashMap representing the distribution of HTTP status codes for
221:             * successfully fetched curis, as represented by a hashmap where key -&gt;
222:             * val represents (string)code -&gt; (integer)count.
223:             * 
224:             * <b>Note: </b> All the values are wrapped with a
225:             * {@link LongWrapper LongWrapper}
226:             * 
227:             * @return statusCodeDistribution
228:             */
229:            public Hashtable getStatusCodeDistribution() {
230:                return statusCodeDistribution;
231:            }
232:
233:            /**
234:             * Return a HashMap representing the distribution of DNS status codes for
235:             * successfully fetched curis, as represented by a hashmap where key -&gt;
236:             * val represents (string)code -&gt; (integer)count.
237:             * 
238:             * <b>Note: </b> All the values are wrapped with a
239:             * {@link LongWrapper LongWrapper}
240:             * 
241:             * @return dnsStatusCodeDistribution
242:             */
243:            public Hashtable getDnsStatusCodeDistribution() {
244:                return dnsStatusCodeDistribution;
245:            }
246:
247:            public Hashtable getDnsMimeDistribution() {
248:                return mimeTypeDnsDistribution;
249:            }
250:
251:            public long getTotalDnsStatusCodeDocuments() {
252:                return totalDnsStatusCodeDocuments;
253:            }
254:
255:            public long getTotalStatusCodeDocuments() {
256:                return totalStatusCodeDocuments;
257:            }
258:
259:            public long getTotalHostDocuments() {
260:                return totalHostDocuments;
261:            }
262:
263:            public long getTotalDnsHostDocuments() {
264:                return totalDnsHostDocuments;
265:            }
266:
267:            public Hashtable getHostsDnsDistribution() {
268:                return hostsDnsDistribution;
269:            }
270:
271:            public long getTotalHostDnsDocuments() {
272:                return totalDnsHostDocuments;
273:            }
274:
275:            public long getTotalHostSize() {
276:                return totalHostSize;
277:            }
278:
279:            public long getTotalDnsHostSize() {
280:                return totalDnsHostSize;
281:            }
282:
283:            public Hashtable getTldDistribution() {
284:                return tldDistribution;
285:            }
286:
287:            public Hashtable getTldBytes() {
288:                return tldBytes;
289:            }
290:
291:            public long getTotalTldDocuments() {
292:                return totalTldDocuments;
293:            }
294:
295:            public long getTotalTldSize() {
296:                return totalTldSize;
297:            }
298:
299:            public Hashtable getTldHostDistribution() {
300:                return tldHostDistribution;
301:            }
302:
303:            public long getTotalHosts() {
304:                return totalHosts;
305:            }
306:
307:            public String getDurationTime() {
308:                return durationTime;
309:            }
310:
311:            public String getProcessedDocsPerSec() {
312:                return processedDocsPerSec;
313:            }
314:
315:            public String getBandwidthKbytesPerSec() {
316:                return bandwidthKbytesPerSec;
317:            }
318:
319:            public String getTotalDataWritten() {
320:                return totalDataWritten;
321:            }
322:
323:            /**
324:             * Sort the entries of the given HashMap in descending order by their
325:             * values, which must be longs wrapped with <code>LongWrapper</code>.
326:             * <p>
327:             * Elements are sorted by value from largest to smallest. Equal values are
328:             * sorted in an arbitrary, but consistent manner by their keys. Only items
329:             * with identical value and key are considered equal.
330:             *
331:             * If the passed-in map requires access to be synchronized, the caller
332:             * should ensure this synchronization. 
333:             * 
334:             * @param mapOfLongWrapperValues
335:             *            Assumes values are wrapped with LongWrapper.
336:             * @return a sorted set containing the same elements as the map.
337:             */
338:            public TreeMap<String, LongWrapper> getReverseSortedCopy(
339:                    final Map<String, LongWrapper> mapOfLongWrapperValues) {
340:                TreeMap<String, LongWrapper> sortedMap = new TreeMap<String, LongWrapper>(
341:                        new Comparator<String>() {
342:                            public int compare(String e1, String e2) {
343:                                long firstVal = mapOfLongWrapperValues.get(e1).longValue;
344:                                long secondVal = mapOfLongWrapperValues.get(e2).longValue;
345:                                if (firstVal < secondVal) {
346:                                    return 1;
347:                                }
348:                                if (secondVal < firstVal) {
349:                                    return -1;
350:                                }
351:                                // If the values are the same, sort by keys.
352:                                return e1.compareTo(e2);
353:                            }
354:                        });
355:                try {
356:                    sortedMap.putAll(mapOfLongWrapperValues);
357:                } catch (UnsupportedOperationException e) {
358:                    for (String key : mapOfLongWrapperValues.keySet()) {
359:                        sortedMap.put(key, mapOfLongWrapperValues.get(key));
360:                    }
361:                }
362:                return sortedMap;
363:            }
364:
365:            /**
366:             * Get the number of hosts with a particular TLD.
367:             * @param tld
368:             * 				top-level domain name
369:             * @return		Total crawled hosts
370:             */
371:            public long getHostsPerTld(String tld) {
372:                LongWrapper lw = (LongWrapper) tldHostDistribution.get(tld);
373:                return (lw == null ? 0 : lw.longValue);
374:            }
375:
376:            /**
377:             * Read status code distribution from responsecode-report.txt.
378:             * DNS and HTTP status codes are separated when read.
379:             * @return True if we found some stats.
380:             */
381:            private boolean calculateStatusCodeDistribution() {
382:                // Read from responsecode-report.txt
383:                File f = new File(cjob.getDirectory(),
384:                        "responsecode-report.txt");
385:                if (!f.exists()) {
386:                    return false;
387:                }
388:                BufferedReader br = null;
389:                try {
390:                    FileReader reader = new FileReader(f);
391:                    br = new BufferedReader(reader);
392:                    String line = br.readLine(); // Ignore heading
393:                    line = br.readLine();
394:                    while (line != null) {
395:                        // Get status code and # urls which are seperated by a space
396:
397:                        String[] items = line.split(" ");
398:                        if (items.length < 2) {
399:                            logger.log(Level.WARNING,
400:                                    "Unexpected formatting on line [" + line
401:                                            + "]");
402:                        } else {
403:                            // See if DNS or HTTP status code
404:                            if (items[0].length() < 3) {
405:                                // DNS status code
406:                                long total = Long.parseLong(items[1]);
407:                                dnsStatusCodeDistribution.put(items[0],
408:                                        new LongWrapper(total));
409:                                totalDnsStatusCodeDocuments += total;
410:                            } else {
411:                                // HTTP status code
412:                                long total = Long.parseLong(items[1]);
413:                                statusCodeDistribution.put(items[0],
414:                                        new LongWrapper(total));
415:                                totalStatusCodeDocuments += total;
416:                            }
417:                        }
418:                        line = br.readLine();
419:                    }
420:                } catch (IOException e) {
421:                    logger.log(Level.SEVERE, "Unable to read "
422:                            + f.getAbsolutePath(), e);
423:                } finally {
424:                    if (br != null) {
425:                        try {
426:                            br.close();
427:                        } catch (IOException e) {
428:                            logger.log(Level.SEVERE, "Closing "
429:                                    + f.getAbsolutePath(), e);
430:                        }
431:                    }
432:                }
433:                return true;
434:            }
435:
436:            /**
437:             * Read MIME type data from mimetype-report.txt.
438:             * MIME type of text/dns is separated from other MIME types.
439:             * @return True if we found some stats.
440:             */
441:            private boolean calculateMimeTypeDistribution() {
442:                File f = new File(cjob.getDirectory(), "mimetype-report.txt");
443:                if (!f.exists()) {
444:                    return false;
445:                }
446:                BufferedReader br = null;
447:                try {
448:                    FileReader reader = new FileReader(f);
449:                    br = new BufferedReader(reader);
450:                    String line = br.readLine(); // Ignore heading
451:                    line = br.readLine();
452:                    while (line != null) {
453:                        // Get num urls, num bytes, and MIME type (seperated by a space)
454:                        // Example: 12 134279 text/html
455:
456:                        String[] items = line.split(" ");
457:                        if (items.length < 3) {
458:                            logger.log(Level.WARNING,
459:                                    "Unexpected formatting on line [" + line
460:                                            + "]");
461:                        } else {
462:                            long total = Long.parseLong(items[0]);
463:                            long bytes = Long.parseLong(items[1]);
464:                            String mime = items[2];
465:
466:                            // Seperate DNS reconrds from HTTP
467:                            if (mime.equalsIgnoreCase("text/dns")) {
468:                                mimeTypeDnsDistribution.put(mime,
469:                                        new LongWrapper(total));
470:                                mimeTypeDnsBytes.put(mime, new LongWrapper(
471:                                        bytes));
472:                                totalDnsMimeTypeDocuments += total;
473:                                totalDnsMimeSize += bytes;
474:                            } else {
475:                                mimeTypeDistribution.put(mime, new LongWrapper(
476:                                        total));
477:                                mimeTypeBytes.put(mime, new LongWrapper(bytes));
478:                                totalMimeTypeDocuments += total;
479:                                totalMimeSize += bytes;
480:                            }
481:                        }
482:                        line = br.readLine();
483:                    }
484:                } catch (IOException e) {
485:                    logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(),
486:                            e);
487:                } finally {
488:                    if (br != null) {
489:                        try {
490:                            br.close();
491:                        } catch (IOException e) {
492:                            logger.log(Level.SEVERE, "Closing "
493:                                    + f.getAbsolutePath(), e);
494:                        }
495:                    }
496:                }
497:                return true;
498:            }
499:
500:            /**
501:             * Read number of URLs and total bytes for each host name from
502:             * hosts-report.txt.
503:             * Host name of "dns:" is separated from others.
504:             * @return true if stats found.
505:             */
506:            private boolean calculateHostsDistribution() {
507:                File f = new File(cjob.getDirectory(), "hosts-report.txt");
508:                if (!f.exists()) {
509:                    return false;
510:                }
511:                BufferedReader br = null;
512:                try {
513:                    FileReader reader = new FileReader(f);
514:                    br = new BufferedReader(reader);
515:                    String line = br.readLine(); // Ignore heading
516:                    line = br.readLine();
517:                    while (line != null) {
518:                        // Get num urls, num bytes, and host name (seperated by a space)
519:                        // Example: 9 7468 www.blogger.com
520:
521:                        String[] items = line.split(" ");
522:                        if (items.length < 3) {
523:                            logger.log(Level.WARNING,
524:                                    "Unexpected formatting on line [" + line
525:                                            + "]");
526:                        } else {
527:                            long total = Long.parseLong(items[0]);
528:                            long bytes = Long.parseLong(items[1]);
529:                            String host = items[2];
530:
531:                            // Seperate DNS reconrds from HTTP
532:                            if (host.startsWith("dns:", 0)) {
533:                                hostsDnsDistribution.put(host, new LongWrapper(
534:                                        total));
535:                                hostsDnsBytes.put(host, new LongWrapper(bytes));
536:                                totalDnsHostDocuments += total;
537:                                totalDnsHostSize += bytes;
538:                            } else {
539:                                hostsDistribution.put(host, new LongWrapper(
540:                                        total));
541:                                hostsBytes.put(host, new LongWrapper(bytes));
542:                                totalHostDocuments += total;
543:                                totalHostSize += bytes;
544:
545:                                // Count top level domain (TLD)
546:                                String tld = host.substring(host
547:                                        .lastIndexOf('.') + 1);
548:                                incrementMapCount(tldDistribution, tld, total);
549:                                incrementMapCount(tldBytes, tld, bytes);
550:                                incrementMapCount(tldHostDistribution, tld);
551:                                totalTldDocuments += total;
552:                                totalTldSize += bytes;
553:
554:                                totalHosts++;
555:                            }
556:                        }
557:                        line = br.readLine();
558:                    }
559:                } catch (IOException e) {
560:                    logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(),
561:                            e);
562:                } finally {
563:                    if (br != null) {
564:                        try {
565:                            br.close();
566:                        } catch (IOException e) {
567:                            logger.log(Level.SEVERE, "Closing "
568:                                    + f.getAbsolutePath(), e);
569:                        }
570:                    }
571:                }
572:                return true;
573:            }
574:
575:            /**
576:             * Returns the accumulated number of bytes downloaded from a given host.
577:             * @param host name of the host
578:             * @return the accumulated number of bytes downloaded from a given host
579:             */
580:            public long getBytesPerHost(String host) {
581:                long bytes = -1;
582:
583:                bytes = host != null && host.startsWith("dns:", 0) ? ((LongWrapper) hostsDnsBytes
584:                        .get(host)).longValue
585:                        : ((LongWrapper) hostsBytes.get(host)).longValue;
586:
587:                return bytes;
588:            }
589:
590:            /**
591:             * Returns the total number of bytes downloaded for a given TLD.
592:             * @param tld TLD
593:             * @return the total number of bytes downloaded for a given TLD
594:             */
595:            public long getBytesPerTld(String tld) {
596:                LongWrapper lw = (LongWrapper) tldBytes.get(tld);
597:                return (lw == null ? 0 : lw.longValue);
598:            }
599:
600:            /**
601:             * Returns the accumulated number of bytes from files of a given file type.
602:             * @param filetype Filetype to check.
603:             * @return the accumulated number of bytes from files of a given mime type
604:             */
605:            public long getBytesPerMimeType(String filetype) {
606:                long bytes = -1;
607:
608:                if (filetype != null) {
609:                    if (filetype.equals("text/dns")) {
610:                        bytes = mimeTypeDnsBytes.get(filetype) == null ? 0
611:                                : ((LongWrapper) mimeTypeDnsBytes.get(filetype)).longValue;
612:                    } else {
613:                        bytes = mimeTypeBytes.get(filetype) == null ? 0
614:                                : ((LongWrapper) mimeTypeBytes.get(filetype)).longValue;
615:                    }
616:                }
617:                return bytes;
618:            }
619:
620:            /**
621:             * Reads duration time, processed docs/sec, bandwidth, and total size
622:             * of crawl from crawl-report.txt.
623:             * @return true if stats found.
624:             */
625:            public boolean readCrawlReport() {
626:                File f = new File(cjob.getDirectory(), "crawl-report.txt");
627:                if (!f.exists()) {
628:                    return false;
629:                }
630:                BufferedReader br = null;
631:                try {
632:                    FileReader reader = new FileReader(f);
633:                    br = new BufferedReader(reader);
634:                    String line = br.readLine();
635:                    while (line != null) {
636:                        if (line.startsWith("Duration Time")) {
637:                            durationTime = line
638:                                    .substring(line.indexOf(':') + 1);
639:                        } else if (line.startsWith("Processed docs/sec")) {
640:                            processedDocsPerSec = line.substring(line
641:                                    .indexOf(':') + 1);
642:                        } else if (line.startsWith("Bandwidth in Kbytes/sec")) {
643:                            bandwidthKbytesPerSec = line.substring(line
644:                                    .indexOf(':') + 1);
645:                        } else if (line
646:                                .startsWith("Total Raw Data Size in Bytes")) {
647:                            totalDataWritten = line
648:                                    .substring(line.indexOf(':') + 1);
649:                        }
650:
651:                        line = br.readLine();
652:                    }
653:                } catch (IOException e) {
654:                    logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(),
655:                            e);
656:                } finally {
657:                    if (br != null) {
658:                        try {
659:                            br.close();
660:                        } catch (IOException e) {
661:                            logger.log(Level.SEVERE, "Failed close of "
662:                                    + f.getAbsolutePath(), e);
663:                        }
664:                    }
665:                }
666:                return true;
667:            }
668:
669:            /**
670:             * Returns sorted Iterator of seeds records based on status code.
671:             * @return sorted Iterator of seeds records
672:             */
673:            public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
674:                TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
675:                        new Comparator<SeedRecord>() {
676:                            public int compare(SeedRecord sr1, SeedRecord sr2) {
677:                                int code1 = sr1.getStatusCode();
678:                                int code2 = sr2.getStatusCode();
679:                                if (code1 == code2) {
680:                                    // If the values are equal, sort by URIs.
681:                                    return sr1.getUri().compareTo(sr2.getUri());
682:                                }
683:                                // mirror and shift the nubmer line so as to
684:                                // place zero at the beginning, then all negatives 
685:                                // in order of ascending absolute value, then all 
686:                                // positives descending
687:                                code1 = -code1 - Integer.MAX_VALUE;
688:                                code2 = -code2 - Integer.MAX_VALUE;
689:
690:                                return new Integer(code1)
691:                                        .compareTo(new Integer(code2));
692:                            }
693:                        });
694:                for (SeedRecord sr : processedSeedsRecords.values()) {
695:                    sortedSet.add(sr);
696:                }
697:
698:                return sortedSet.iterator();
699:            }
700:
701:            /**
702:             * Reads seed data from seeds-report.txt.
703:             * @return True if stats found.
704:             */
705:            private boolean readSeedReport() {
706:                File f = new File(cjob.getDirectory(), "seeds-report.txt");
707:                if (!f.exists()) {
708:                    return false;
709:                }
710:                BufferedReader br = null;
711:                try {
712:                    FileReader reader = new FileReader(f);
713:                    br = new BufferedReader(reader);
714:
715:                    // Ignore heading: [code] [status] [seed] [redirect]
716:                    String line = br.readLine();
717:                    line = br.readLine();
718:                    while (line != null) {
719:                        // Example lines:
720:                        // 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
721:                        // 200 CRAWLED http://noleeo.com/
722:
723:                        String[] items = line.split(" ");
724:
725:                        if (items.length < 3) {
726:                            logger.log(Level.WARNING,
727:                                    "Unexpected formatting on line [" + line
728:                                            + "]");
729:                        } else {
730:                            String statusCode = items[0];
731:                            String crawlStatus = items[1];
732:                            String seed = items[2];
733:                            String redirect = items.length > 3 ? items[3]
734:                                    : null;
735:
736:                            // All values should be CRAWLED or NOTCRAWLED
737:                            if (crawlStatus.equals("CRAWLED")) {
738:                                crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
739:                            } else {
740:                                crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
741:                            }
742:                            SeedRecord sr = new SeedRecord(seed, crawlStatus,
743:                                    Integer.parseInt(statusCode), redirect);
744:                            processedSeedsRecords.put(seed, sr);
745:                        }
746:
747:                        line = br.readLine();
748:                    }
749:                } catch (IOException e) {
750:                    logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(),
751:                            e);
752:                } finally {
753:                    if (br != null) {
754:                        try {
755:                            br.close();
756:                        } catch (IOException e) {
757:                            logger.log(Level.SEVERE, "Closing "
758:                                    + f.getAbsolutePath(), e);
759:                        }
760:                    }
761:                }
762:                return true;
763:            }
764:
765:            /**
766:             * Return a copy of the hosts distribution in reverse-sorted
767:             * (largest first) order.
768:             *  
769:             * @return SortedMap of hosts distribution
770:             */
771:            public SortedMap getReverseSortedHostsDistribution() {
772:                return getReverseSortedCopy(hostsDistribution);
773:            }
774:
775:            /**
776:             * @return True if we compiled stats, false if none to compile (e.g.
777:             * there are no reports files on disk).
778:             */
779:            public boolean isStats() {
780:                return this.stats;
781:            }
782:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.