0001: /* Copyright (C) 2003 Internet Archive.
0002: *
0003: * This file is part of the Heritrix web crawler (crawler.archive.org).
0004: *
0005: * Heritrix is free software; you can redistribute it and/or modify
0006: * it under the terms of the GNU Lesser Public License as published by
0007: * the Free Software Foundation; either version 2.1 of the License, or
0008: * any later version.
0009: *
0010: * Heritrix is distributed in the hope that it will be useful,
0011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0013: * GNU Lesser Public License for more details.
0014: *
0015: * You should have received a copy of the GNU Lesser Public License
0016: * along with Heritrix; if not, write to the Free Software
0017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0018: *
0019: * Created on Jul 16, 2003
0020: *
0021: */
0022: package org.archive.crawler.admin;
0023:
0024: import java.io.File;
0025: import java.io.FileWriter;
0026: import java.io.IOException;
0027: import java.io.PrintWriter;
0028: import java.io.Serializable;
0029: import java.util.Comparator;
0030: import java.util.Date;
0031: import java.util.EventObject;
0032: import java.util.Hashtable;
0033: import java.util.Iterator;
0034: import java.util.List;
0035: import java.util.Map;
0036: import java.util.HashMap;
0037: import java.util.SortedMap;
0038: import java.util.TreeMap;
0039: import java.util.TreeSet;
0040: import java.util.Vector;
0041: import java.util.logging.Level;
0042: import java.util.logging.Logger;
0043:
0044: import org.archive.crawler.datamodel.CrawlURI;
0045: import org.archive.crawler.event.CrawlURIDispositionListener;
0046: import org.archive.crawler.framework.AbstractTracker;
0047: import org.archive.crawler.framework.CrawlController;
0048: import org.archive.crawler.framework.exceptions.FatalConfigurationException;
0049: import org.archive.crawler.util.CrawledBytesHistotable;
0050: import org.archive.net.UURI;
0051: import org.archive.util.ArchiveUtils;
0052: import org.archive.util.Histotable;
0053: import org.archive.util.LongWrapper;
0054: import org.archive.util.MimetypeUtils;
0055: import org.archive.util.PaddingStringBuffer;
0056:
0057: /**
0058: * This is an implementation of the AbstractTracker. It is designed to work
0059: * with the WUI and also performs various logging activities.
0060: * <p>
0061: * At the end of each snapshot a line is written to the
0062: * 'progress-statistics.log' file.
0063: * <p>
0064: * The header of that file is as follows:
0065: * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
0066: * First there is a <b>timestamp</b>, accurate down to 1 second.
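* <p>
* A data line in that format might look like the following (hypothetical,
* illustrative values only; the timestamp is rendered by
* <code>ArchiveUtils.getLog14Date()</code>):
* <pre> [timestamp] 1200 800 400 5.00(4.50) 500(450) 3 25 10240</pre>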
0067: * <p>
0068: * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
0069: * are (respectively) the discovered URI count, pending URI count, successfully
0070: * fetched count and failed fetch count from the frontier at the time of the
0071: * snapshot.
0072: * <p>
0073: * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
0074: * to calculate average bandwidth usage (KB/sec). Since we also note the value
0075: * each time a snapshot is made we can calculate the average bandwidth usage
0076: * during the last snapshot period to get a "current" rate. The first number is
0077: * the current rate and the average is in parentheses.
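* <p>
* For illustration (hypothetical figures): if 2,048,000,000 bytes have been
* downloaded over 4,000 seconds of elapsed crawl time, the average is
* (2,048,000,000 / 1024) / 4,000 = 500 KB/sec; the "current" figure applies
* the same calculation to just the bytes and seconds elapsed since the
* previous snapshot.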
0078: * <p>
0079: * <b>doc/s(avg)</b> works the same way as KB/s(avg) except that it shows the
0080: * number of documents (URIs) downloaded rather than KB.
0081: * <p>
0082: * <b>busy-threads</b> is the total number of ToeThreads that are not available
0083: * (and thus presumably busy processing a URI). This information is extracted
0084: * from the crawl controller.
0085: * <p>
0086: * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
0087: * (<code>Runtime.getRuntime().totalMemory()</code>).
0088: * <p>
0089: * In addition to the data collected for the above logs, various other data
0090: * is gathered and stored by this tracker.
0091: * <ul>
0092: * <li> Successfully downloaded documents per fetch status code
0093: * <li> Successfully downloaded documents per document mime type
0094: * <li> Amount of data per mime type
0095: * <li> Successfully downloaded documents per host
0096: * <li> Amount of data per host
0097: * <li> Disposition of all seeds (this is written to 'reports.log' at end of
0098: * crawl)
0099: * <li> Successfully downloaded documents per host per source
0100: * </ul>
0101: *
0102: * @author Parker Thompson
0103: * @author Kristinn Sigurdsson
0104: *
0105: * @see org.archive.crawler.framework.StatisticsTracking
0106: * @see org.archive.crawler.framework.AbstractTracker
0107: */
0108: public class StatisticsTracker extends AbstractTracker implements
0109: CrawlURIDispositionListener, Serializable {
0110: private static final long serialVersionUID = 8004878315916392305L;
0111:
0112: /**
0113: * Messages from the StatisticsTracker.
0114: */
0115: private final static Logger logger = Logger
0116: .getLogger(StatisticsTracker.class.getName());
0117:
0118: // TODO: Need to be able to specify file where the object will be
0119: // written once the CrawlEnded event occurs
0120:
0121: protected long lastPagesFetchedCount = 0;
0122: protected long lastProcessedBytesCount = 0;
0123:
0124: /*
0125: * Snapshot data.
0126: */
0127: protected long discoveredUriCount = 0;
0128: protected long queuedUriCount = 0;
0129: protected long finishedUriCount = 0;
0130:
0131: protected long downloadedUriCount = 0;
0132: protected long downloadFailures = 0;
0133: protected long downloadDisregards = 0;
0134: protected double docsPerSecond = 0;
0135: protected double currentDocsPerSecond = 0;
0136: protected int currentKBPerSec = 0;
0137: protected long totalKBPerSec = 0;
0138: protected int busyThreads = 0;
0139: protected long totalProcessedBytes = 0;
0140: protected float congestionRatio = 0;
0141: protected long deepestUri;
0142: protected long averageDepth;
0143:
0144: /*
0145: * Cumulative data
0146: */
0147: /** tally sizes novel, verified (same hash), vouched (not-modified) */
0148: protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
0149:
0150: /** Keep track of the file types we see (mime type -> count) */
0151: protected Hashtable<String, LongWrapper> mimeTypeDistribution = new Hashtable<String, LongWrapper>();
0152: protected Hashtable<String, LongWrapper> mimeTypeBytes = new Hashtable<String, LongWrapper>();
0153:
0154: /** Keep track of fetch status codes */
0155: protected Hashtable<String, LongWrapper> statusCodeDistribution = new Hashtable<String, LongWrapper>();
0156:
0157: /** Keep track of hosts.
0158: *
0159: * Each of these Maps is individually unsynchronized, and cannot
0160: * be trivially synchronized with the Collections wrapper. Thus
0161: * their synchronized access is enforced by this class.
0162: *
0163: * <p>They're transient because they are usually bigmaps that get
0164: * reconstituted on recovery from a checkpoint.
0165: */
0166: protected transient Map<String, LongWrapper> hostsDistribution = null;
0167: protected transient Map<String, LongWrapper> hostsBytes = null;
0168: protected transient Map<String, Long> hostsLastFinished = null;
0169:
0170: /** Keep track of URL counts per host per seed */
0171: protected transient Map<String, HashMap<String, LongWrapper>> sourceHostDistribution = null;
0172:
0173: /**
0174: * Record of seeds' latest actions.
0175: */
0176: protected transient Map<String, SeedRecord> processedSeedsRecords;
0177:
0178: // seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN
0179: private int seedsCrawled;
0180: private int seedsNotCrawled;
0181: // sExitMessage: only set at crawl-end
0182: private String sExitMessage = "Before crawl end";
0183:
0184: public StatisticsTracker(String name) {
0185: super (
0186: name,
0187: "A statistics tracker that's integrated into "
0188: + "the web UI and that creates the progress-statistics log.");
0189: }
0190:
0191: public void initialize(CrawlController c)
0192: throws FatalConfigurationException {
0193: super .initialize(c);
0194: try {
0195: this .sourceHostDistribution = c.getBigMap(
0196: "sourceHostDistribution", String.class,
0197: HashMap.class);
0198: this .hostsDistribution = c.getBigMap("hostsDistribution",
0199: String.class, LongWrapper.class);
0200: this .hostsBytes = c.getBigMap("hostsBytes", String.class,
0201: LongWrapper.class);
0202: this .hostsLastFinished = c.getBigMap("hostsLastFinished",
0203: String.class, Long.class);
0204: this .processedSeedsRecords = c.getBigMap(
0205: "processedSeedsRecords", String.class,
0206: SeedRecord.class);
0207: } catch (Exception e) {
0208: throw new FatalConfigurationException("Failed setup of"
0209: + " StatisticsTracker: " + e);
0210: }
0211: controller.addCrawlURIDispositionListener(this );
0212: }
0213:
0214: protected void finalCleanup() {
0215: super .finalCleanup();
0216: if (this .hostsBytes != null) {
0217: this .hostsBytes.clear();
0218: this .hostsBytes = null;
0219: }
0220: if (this .hostsDistribution != null) {
0221: this .hostsDistribution.clear();
0222: this .hostsDistribution = null;
0223: }
0224: if (this .hostsLastFinished != null) {
0225: this .hostsLastFinished.clear();
0226: this .hostsLastFinished = null;
0227: }
0228: if (this .processedSeedsRecords != null) {
0229: this .processedSeedsRecords.clear();
0230: this .processedSeedsRecords = null;
0231: }
0232: if (this .sourceHostDistribution != null) {
0233: this .sourceHostDistribution.clear();
0234: this .sourceHostDistribution = null;
0235: }
0236:
0237: }
0238:
0239: protected synchronized void progressStatisticsEvent(
0240: final EventObject e) {
0241: // This method loads "snapshot" data.
0242: discoveredUriCount = discoveredUriCount();
0243: downloadedUriCount = successfullyFetchedCount();
0244: finishedUriCount = finishedUriCount();
0245: queuedUriCount = queuedUriCount();
0246: downloadFailures = failedFetchAttempts();
0247: downloadDisregards = disregardedFetchAttempts();
0248: totalProcessedBytes = totalBytesCrawled();
0249: congestionRatio = congestionRatio();
0250: deepestUri = deepestUri();
0251: averageDepth = averageDepth();
0252:
0253: if (finishedUriCount() == 0) {
0254: docsPerSecond = 0;
0255: totalKBPerSec = 0;
0256: } else if (getCrawlerTotalElapsedTime() < 1000) {
0257: return; // Not enough time has passed for a decent snapshot.
0258: } else {
0259: docsPerSecond = (double) downloadedUriCount
0260: / (double) (getCrawlerTotalElapsedTime() / 1000);
0261: // Round to nearest long.
0262: totalKBPerSec = (long) (((totalProcessedBytes / 1024) / ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
0263: }
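// Illustration of the calculations above, with hypothetical figures:
// 10,000 downloaded URIs over 2,000,000 ms of elapsed crawl time gives
// docsPerSecond = 10000 / 2000 = 5.0, and 1,024,000,000 processed bytes
// over the same period gives totalKBPerSec = (1,024,000,000 / 1024) / 2000 = 500.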
0264:
0265: busyThreads = activeThreadCount();
0266:
0267: if (shouldrun
0268: || (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
0269: // If shouldrun is false there is a chance that the time interval
0270: // since last time is too small for a good sample. We only want
0271: // to update "current" data when the interval is long enough or
0272: // shouldrun is true.
0273: currentDocsPerSecond = 0;
0274: currentKBPerSec = 0;
0275:
0276: // Note time.
0277: long currentTime = System.currentTimeMillis();
0278: long sampleTime = currentTime - lastLogPointTime;
0279:
0280: // If we haven't done anything or there isn't a reasonable sample
0281: // size, give up.
0282: if (sampleTime >= 1000) {
0283: // Update docs/sec snapshot
0284: long currentPageCount = successfullyFetchedCount();
0285: long samplePageCount = currentPageCount
0286: - lastPagesFetchedCount;
0287:
0288: currentDocsPerSecond = (double) samplePageCount
0289: / (double) (sampleTime / 1000);
0290:
0291: lastPagesFetchedCount = currentPageCount;
0292:
0293: // Update kbytes/sec snapshot
0294: long currentProcessedBytes = totalProcessedBytes;
0295: long sampleProcessedBytes = currentProcessedBytes
0296: - lastProcessedBytesCount;
0297:
0298: currentKBPerSec = (int) (((sampleProcessedBytes / 1024) / (sampleTime / 1000)) + .5);
0299:
0300: lastProcessedBytesCount = currentProcessedBytes;
0301: }
0302: }
0303:
0304: if (this .controller != null) {
0305: this .controller
0306: .logProgressStatistics(getProgressStatisticsLine());
0307: }
0308: lastLogPointTime = System.currentTimeMillis();
0309: super .progressStatisticsEvent(e);
0310: }
0311:
0312: /**
0313: * Return one line of current progress-statistics
0314: *
0315: * @param now timestamp to use for the statistics line
0316: * @return String of stats
0317: */
0318: public String getProgressStatisticsLine(Date now) {
0319: return new PaddingStringBuffer()
0320: .append(ArchiveUtils.getLog14Date(now))
0321: .raAppend(32, discoveredUriCount)
0322: .raAppend(44, queuedUriCount)
0323: .raAppend(57, downloadedUriCount)
0324: .raAppend(
0325: 74,
0326: ArchiveUtils.doubleToString(
0327: currentDocsPerSecond, 2)
0328: + "("
0329: + ArchiveUtils.doubleToString(
0330: docsPerSecond, 2) + ")")
0331: .raAppend(85,
0332: currentKBPerSec + "(" + totalKBPerSec + ")")
0333: .raAppend(99, downloadFailures)
0334: .raAppend(113, busyThreads)
0335: .raAppend(
0336: 126,
0337: (Runtime.getRuntime().totalMemory() - Runtime
0338: .getRuntime().freeMemory()) / 1024)
0339: .raAppend(140,
0340: Runtime.getRuntime().totalMemory() / 1024)
0341: .raAppend(153,
0342: ArchiveUtils.doubleToString(congestionRatio, 2))
0343: .raAppend(165, deepestUri).raAppend(177, averageDepth)
0344: .toString();
0345: }
0346:
0347: public Map<String, Number> getProgressStatistics() {
0348: Map<String, Number> stats = new HashMap<String, Number>();
0349: stats.put("discoveredUriCount", new Long(discoveredUriCount));
0350: stats.put("queuedUriCount", new Long(queuedUriCount));
0351: stats.put("downloadedUriCount", new Long(downloadedUriCount));
0352: stats.put("currentDocsPerSecond", new Double(
0353: currentDocsPerSecond));
0354: stats.put("docsPerSecond", new Double(docsPerSecond));
0355: stats.put("totalKBPerSec", new Long(totalKBPerSec));
0356: stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
0357: stats.put("currentKBPerSec", new Long(currentKBPerSec));
0358: stats.put("downloadFailures", new Long(downloadFailures));
0359: stats.put("busyThreads", new Integer(busyThreads));
0360: stats.put("congestionRatio", new Double(congestionRatio));
0361: stats.put("deepestUri", new Long(deepestUri));
0362: stats.put("averageDepth", new Long(averageDepth));
0363: stats.put("totalMemory", new Long(Runtime.getRuntime()
0364: .totalMemory()));
0365: stats.put("freeMemory", new Long(Runtime.getRuntime()
0366: .freeMemory()));
0367: return stats;
0368: }
0369:
0370: /**
0371: * Return one line of current progress-statistics
0372: *
0373: * @return String of stats
0374: */
0375: public String getProgressStatisticsLine() {
0376: return getProgressStatisticsLine(new Date());
0377: }
0378:
0379: public double processedDocsPerSec() {
0380: return docsPerSecond;
0381: }
0382:
0383: public double currentProcessedDocsPerSec() {
0384: return currentDocsPerSecond;
0385: }
0386:
0387: public long processedKBPerSec() {
0388: return totalKBPerSec;
0389: }
0390:
0391: public int currentProcessedKBPerSec() {
0392: return currentKBPerSec;
0393: }
0394:
0395: /** Returns a Hashtable that contains information about the distribution of
0396: * encountered mime types. Key/value pairs represent
0397: * mime type -> count.
0398: * <p>
0399: * <b>Note:</b> All the values are wrapped with a {@link LongWrapper LongWrapper}
0400: * @return mimeTypeDistribution
0401: */
0402: public Hashtable<String, LongWrapper> getFileDistribution() {
0403: return mimeTypeDistribution;
0404: }
0405:
0406: /**
0407: * Increment a counter for a key in a given Map. Used for various
0408: * aggregate data.
0409: *
0410: * As this is used to change Maps which depend on StatisticsTracker
0411: * for their synchronization, this method should only be invoked
0412: * from a block synchronized on 'this'.
0413: *
0414: * @param map The Map to update
0415: * @param key The key for the counter to be incremented; if it does not
0416: * exist it will be added (set to 1). If null, the counter
0417: * "unknown" will be incremented.
0418: */
0419: protected static void incrementMapCount(
0420: Map<String, LongWrapper> map, String key) {
0421: incrementMapCount(map, key, 1);
0422: }
0423:
0424: /**
0425: * Increment a counter for a key in a given Map by an arbitrary amount.
0426: * Used for various aggregate data. The increment amount can be negative.
0427: *
0428: * As this is used to change Maps which depend on StatisticsTracker
0429: * for their synchronization, this method should only be invoked
0430: * from a block synchronized on 'this'.
0431: *
0432: * @param map
0433: * The Map to update
0434: * @param key
0435: * The key for the counter to be incremented; if it does not exist
0436: * it will be added (set equal to <code>increment</code>).
0437: * If null, the counter "unknown" will be incremented.
0438: * @param increment
0439: * The amount by which to increment the counter for the <code>key</code>.
0440: */
0441: protected static void incrementMapCount(
0442: Map<String, LongWrapper> map, String key, long increment) {
0443: if (key == null) {
0444: key = "unknown";
0445: }
0446: LongWrapper lw = (LongWrapper) map.get(key);
0447: if (lw == null) {
0448: map.put(key, new LongWrapper(increment));
0449: } else {
0450: lw.longValue += increment;
0451: }
0452: }
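// Usage illustration (hypothetical values): within a block synchronized on
// this tracker, incrementMapCount(mimeTypeDistribution, "text/html") bumps
// the "text/html" tally by 1, and incrementMapCount(mimeTypeBytes,
// "text/html", 2048) adds 2048 to the byte tally for that mime type.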
0453:
0454: /**
0455: * Sort the entries of the given Map in descending order by their
0456: * values, which must be longs wrapped with <code>LongWrapper</code>.
0457: * <p>
0458: * Elements are sorted by value from largest to smallest. Equal values are
0459: * sorted in an arbitrary, but consistent manner by their keys. Only items
0460: * with identical value and key are considered equal.
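* <p>
* For example (hypothetical values), a map of {"a"=5, "b"=9, "c"=5} would be
* returned in the iteration order b, a, c.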
0461: *
0462: * If the passed-in map requires access to be synchronized, the caller
0463: * should ensure this synchronization.
0464: *
0465: * @param mapOfLongWrapperValues
0466: * Assumes values are wrapped with LongWrapper.
0467: * @return a sorted map containing the same entries as the given map.
0468: */
0469: public TreeMap<String, LongWrapper> getReverseSortedCopy(
0470: final Map<String, LongWrapper> mapOfLongWrapperValues) {
0471: TreeMap<String, LongWrapper> sortedMap = new TreeMap<String, LongWrapper>(
0472: new Comparator<String>() {
0473: public int compare(String e1, String e2) {
0474: long firstVal = mapOfLongWrapperValues.get(e1).longValue;
0475: long secondVal = mapOfLongWrapperValues.get(e2).longValue;
0476: if (firstVal < secondVal) {
0477: return 1;
0478: }
0479: if (secondVal < firstVal) {
0480: return -1;
0481: }
0482: // If the values are the same, sort by keys.
0483: return e1.compareTo(e2);
0484: }
0485: });
0486: try {
0487: sortedMap.putAll(mapOfLongWrapperValues);
0488: } catch (UnsupportedOperationException e) {
0489: Iterator<String> i = mapOfLongWrapperValues.keySet()
0490: .iterator();
0491: for (; i.hasNext();) {
0492: // Ok. Try doing it the slow way then.
0493: String key = i.next();
0494: sortedMap.put(key, mapOfLongWrapperValues.get(key));
0495: }
0496: }
0497: return sortedMap;
0498: }
0499:
0500: /**
0501: * Return a Hashtable representing the distribution of status codes for
0502: * successfully fetched CrawlURIs, where key ->
0503: * val represents (string)code -> count.
0504: *
0505: * <b>Note: </b> All the values are wrapped with a
0506: * {@link LongWrapper LongWrapper}
0507: *
0508: * @return statusCodeDistribution
0509: */
0510: public Hashtable<String, LongWrapper> getStatusCodeDistribution() {
0511: return statusCodeDistribution;
0512: }
0513:
0514: /**
0515: * Returns the time (in millisec) when a URI belonging to a given host was
0516: * last finished processing.
0517: *
0518: * @param host The host to look up time of last completed URI.
0519: * @return Returns the time (in millisec) when a URI belonging to a given
0520: * host was last finished processing. If no URI has been completed for the
0521: * host, -1 will be returned.
0522: */
0523: public long getHostLastFinished(String host) {
0524: Long l = null;
0525: synchronized (hostsLastFinished) {
0526: l = (Long) hostsLastFinished.get(host);
0527: }
0528: return (l != null) ? l.longValue() : -1;
0529: }
0530:
0531: /**
0532: * Returns the accumulated number of bytes downloaded from a given host.
0533: * @param host name of the host
0534: * @return the accumulated number of bytes downloaded from a given host
0535: */
0536: public long getBytesPerHost(String host) {
0537: synchronized (hostsBytes) {
0538: return ((LongWrapper) hostsBytes.get(host)).longValue;
0539: }
0540: }
0541:
0542: /**
0543: * Returns the accumulated number of bytes from files of a given file type.
0544: * @param filetype Filetype to check.
0545: * @return the accumulated number of bytes from files of a given mime type
0546: */
0547: public long getBytesPerFileType(String filetype) {
0548: return ((LongWrapper) mimeTypeBytes.get(filetype)).longValue;
0549: }
0550:
0551: /**
0552: * Get the total number of ToeThreads (sleeping and active)
0553: *
0554: * @return The total number of ToeThreads
0555: */
0556: public int threadCount() {
0557: return this .controller != null ? controller.getToeCount() : 0;
0558: }
0559:
0560: /**
0561: * @return Current thread count (or zero if can't figure it out).
0562: */
0563: public int activeThreadCount() {
0564: return this .controller != null ? controller.getActiveToeCount()
0565: : 0;
0566: // note: reuse of old busy value seemed misleading: anyone asking
0567: // for thread count when paused or stopped still wants accurate reading
0568: }
0569:
0570: /**
0571: * This returns the number of completed URIs as a percentage of the total
0572: * number of URIs encountered (should be inverse to the discovery curve)
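* <p>
* For example (hypothetical counts), 250 finished URIs out of 1,000
* discovered URIs yields 25.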
0573: *
0574: * @return The number of completed URIs as a percentage of the total
0575: * number of URIs encountered
0576: */
0577: public int percentOfDiscoveredUrisCompleted() {
0578: long completed = finishedUriCount();
0579: long total = discoveredUriCount();
0580:
0581: if (total == 0) {
0582: return 0;
0583: }
0584:
0585: return (int) (100 * completed / total);
0586: }
0587:
0588: /**
0589: * Number of <i>discovered</i> URIs.
0590: *
0591: * <p>If crawl not running (paused or stopped) this will return the value of
0592: * the last snapshot.
0593: *
0594: * @return A count of all uris encountered
0595: *
0596: * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
0597: */
0598: public long discoveredUriCount() {
0599: // While shouldrun is true we can use info direct from the crawler.
0600: // After that our last snapshot will have to do.
0601: return shouldrun && this .controller != null
0602: && this .controller.getFrontier() != null ? controller
0603: .getFrontier().discoveredUriCount()
0604: : discoveredUriCount;
0605: }
0606:
0607: /**
0608: * Number of URIs that have <i>finished</i> processing.
0609: *
0610: * @return Number of URIs that have finished processing
0611: *
0612: * @see org.archive.crawler.framework.Frontier#finishedUriCount()
0613: */
0614: public long finishedUriCount() {
0615: return shouldrun && this .controller != null
0616: && this .controller.getFrontier() != null ? controller
0617: .getFrontier().finishedUriCount() : finishedUriCount;
0618: }
0619:
0620: /**
0621: * Get the total number of failed fetch attempts (connection failures -> give up, etc)
0622: *
0623: * @return The total number of failed fetch attempts
0624: */
0625: public long failedFetchAttempts() {
0626: // While shouldrun is true we can use info direct from the crawler.
0627: // After that our last snapshot will have to do.
0628: return shouldrun && this .controller != null
0629: && this .controller.getFrontier() != null ? controller
0630: .getFrontier().failedFetchCount() : downloadFailures;
0631: }
0632:
0633: /**
0634: * Get the total number of disregarded fetch attempts (e.g. URIs ruled out by robots exclusions)
0635: *
0636: * @return The total number of disregarded fetch attempts
0637: */
0638: public long disregardedFetchAttempts() {
0639: // While shouldrun is true we can use info direct from the crawler.
0640: // After that our last snapshot will have to do.
0641: return shouldrun && this .controller != null
0642: && this .controller.getFrontier() != null ? controller
0643: .getFrontier().disregardedUriCount()
0644: : downloadDisregards;
0645: }
0646:
0647: public long successfullyFetchedCount() {
0648: // While shouldrun is true we can use info direct from the crawler.
0649: // After that our last snapshot will have to do.
0650: return shouldrun && this .controller != null
0651: && this .controller.getFrontier() != null ? controller
0652: .getFrontier().succeededFetchCount()
0653: : downloadedUriCount;
0654: }
0655:
0656: public long totalCount() {
0657: return queuedUriCount() + activeThreadCount()
0658: + successfullyFetchedCount();
0659: }
0660:
0661: /**
0662: * Ratio of number of threads that would theoretically allow
0663: * maximum crawl progress (if each was as productive as current
0664: * threads), to current number of threads.
0665: *
0666: * @return float congestion ratio
0667: */
0668: public float congestionRatio() {
0669: // While shouldrun is true we can use info direct from the crawler.
0670: // After that our last snapshot will have to do.
0671: return shouldrun && this .controller != null
0672: && this .controller.getFrontier() != null ? controller
0673: .getFrontier().congestionRatio() : congestionRatio;
0674: }
0675:
0676: /**
0677: * Ordinal position of the 'deepest' URI eligible
0678: * for crawling. Essentially, the length of the longest
0679: * frontier internal queue.
0680: *
0681: * @return long URI count to deepest URI
0682: */
0683: public long deepestUri() {
0684: // While shouldrun is true we can use info direct from the crawler.
0685: // After that our last snapshot will have to do.
0686: return shouldrun && this .controller != null
0687: && this .controller.getFrontier() != null ? controller
0688: .getFrontier().deepestUri() : deepestUri;
0689: }
0690:
0691: /**
0692: * Average depth of the last URI in all eligible queues.
0693: * That is, the average length of all eligible queues.
0694: *
0695: * @return long average depth of last URIs in queues
0696: */
0697: public long averageDepth() {
0698: // While shouldrun is true we can use info direct from the crawler.
0699: // After that our last snapshot will have to do.
0700: return shouldrun && this .controller != null
0701: && this .controller.getFrontier() != null ? controller
0702: .getFrontier().averageDepth() : averageDepth;
0703: }
0704:
0705: /**
0706: * Number of URIs <i>queued</i> up and waiting for processing.
0707: *
0708: * <p>If crawl not running (paused or stopped) this will return the value
0709: * of the last snapshot.
0710: *
0711: * @return Number of URIs queued up and waiting for processing.
0712: *
0713: * @see org.archive.crawler.framework.Frontier#queuedUriCount()
0714: */
0715: public long queuedUriCount() {
0716: // While shouldrun is true we can use info direct from the crawler.
0717: // After that our last snapshot will have to do.
0718: return shouldrun && this .controller != null
0719: && this .controller.getFrontier() != null ? controller
0720: .getFrontier().queuedUriCount() : queuedUriCount;
0721: }
0722:
0723: /** @deprecated use totalBytesCrawled */
0724: public long totalBytesWritten() {
0725: // return totalBytesCrawled();
0726: return shouldrun && this .controller != null
0727: && this .controller.getFrontier() != null ? controller
0728: .getFrontier().totalBytesWritten()
0729: : totalProcessedBytes;
0730: }
0731:
0732: public long totalBytesCrawled() {
0733: return shouldrun ? crawledBytes.getTotal()
0734: : totalProcessedBytes;
0735: }
0736:
0737: public String crawledBytesSummary() {
0738: return crawledBytes.summary();
0739: }
0740:
0741: /**
0742: * If the curi is a seed, we update the processedSeedsRecords map.
0743: *
0744: * @param curi The CrawlURI that may be a seed.
0745: * @param disposition The disposition of the CrawlURI.
0746: */
0747: private void handleSeed(CrawlURI curi, String disposition) {
0748: if (curi.isSeed()) {
0749: SeedRecord sr = new SeedRecord(curi, disposition);
0750: processedSeedsRecords.put(sr.getUri(), sr);
0751: }
0752: }
0753:
0754: public void crawledURISuccessful(CrawlURI curi) {
0755: handleSeed(curi, SEED_DISPOSITION_SUCCESS);
0756: // save crawled bytes tally
0757: crawledBytes.accumulate(curi);
0758:
0759: // Save status codes
0760: incrementMapCount(statusCodeDistribution, Integer.toString(curi
0761: .getFetchStatus()));
0762:
0763: // Save mime types
0764: String mime = MimetypeUtils.truncate(curi.getContentType());
0765: incrementMapCount(mimeTypeDistribution, mime);
0766: incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
0767:
0768: // Save hosts stats.
0769: saveHostStats((curi.getFetchStatus() == 1) ? "dns:"
0770: : this .controller.getServerCache().getHostFor(curi)
0771: .getHostName(), curi.getContentSize());
0772:
0773: if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
0774: saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
0775: this .controller.getServerCache().getHostFor(curi)
0776: .getHostName());
0777: }
0778: }
0779:
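// Illustration (hypothetical seed and host names): after a few successful
// fetches, sourceHostDistribution might map
// "http://example.com/seeds.txt" -> { "example.com" -> 12, "images.example.com" -> 3 },
// i.e. 12 and 3 URIs crawled on those hosts that were reached via that seed.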
0780: protected void saveSourceStats(String source, String hostname) {
0781: synchronized (sourceHostDistribution) {
0782: HashMap<String, LongWrapper> hostUriCount = sourceHostDistribution
0783: .get(source);
0784: if (hostUriCount == null) {
0785: hostUriCount = new HashMap<String, LongWrapper>();
0786: }
0787: // TODO: Dan suggests we don't need a hashtable value. Might
0788: // be faster if we went without. Could just have keys of:
0789: // seed | host (concatenated as string)
0790: // and values of:
0791: // #urls
0792: incrementMapCount(hostUriCount, hostname);
0793: sourceHostDistribution.put(source, hostUriCount);
0794: }
0795: }
0796:
0797: protected void saveHostStats(String hostname, long size) {
0798: synchronized (hostsDistribution) {
0799: incrementMapCount(hostsDistribution, hostname);
0800: }
0801: synchronized (hostsBytes) {
0802: incrementMapCount(hostsBytes, hostname, size);
0803: }
0804: synchronized (hostsLastFinished) {
0805: hostsLastFinished.put(hostname, new Long(System
0806: .currentTimeMillis()));
0807: }
0808: }
0809:
0810: public void crawledURINeedRetry(CrawlURI curi) {
0811: handleSeed(curi, SEED_DISPOSITION_RETRY);
0812: }
0813:
0814: public void crawledURIDisregard(CrawlURI curi) {
0815: handleSeed(curi, SEED_DISPOSITION_DISREGARD);
0816: }
0817:
0818: public void crawledURIFailure(CrawlURI curi) {
0819: handleSeed(curi, SEED_DISPOSITION_FAILURE);
0820: }
0821:
0822: /**
0823: * Get a seed iterator for the job being monitored.
0824: *
0825: * <b>Note:</b> This iterator will iterate over a list of <i>strings</i>, not
0826: * UURIs like the Scope seed iterator. The strings are equal to the URIs'
0827: * getURIString() values.
0828: * @return the seed iterator
0829: * FIXME: Consider using TransformingIterator here
0830: */
0831: public Iterator<String> getSeeds() {
0832: List<String> seedsCopy = new Vector<String>();
0833: Iterator<UURI> i = controller.getScope().seedsIterator();
0834: while (i.hasNext()) {
0835: seedsCopy.add(i.next().toString());
0836: }
0837: return seedsCopy.iterator();
0838: }
0839:
0840: public Iterator getSeedRecordsSortedByStatusCode() {
0841: return getSeedRecordsSortedByStatusCode(getSeeds());
0842: }
0843:
0844: protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(
0845: Iterator<String> i) {
0846: TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
0847: new Comparator<SeedRecord>() {
0848: public int compare(SeedRecord sr1, SeedRecord sr2) {
0849: int code1 = sr1.getStatusCode();
0850: int code2 = sr2.getStatusCode();
0851: if (code1 == code2) {
0852: // If the values are equal, sort by URIs.
0853: return sr1.getUri().compareTo(sr2.getUri());
0854: }
0855: // Mirror and shift the number line so as to
0856: // place zero at the beginning, then all negatives
0857: // in order of ascending absolute value, then all
0858: // positives descending.
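// Illustration (hypothetical codes): 0 sorts first, then -1, -2, ...,
// then e.g. 404 before 200.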
0859: code1 = -code1 - Integer.MAX_VALUE;
0860: code2 = -code2 - Integer.MAX_VALUE;
0861:
0862: return new Integer(code1)
0863: .compareTo(new Integer(code2));
0864: }
0865: });
0866: while (i.hasNext()) {
0867: String seed = i.next();
0868: SeedRecord sr = (SeedRecord) processedSeedsRecords
0869: .get(seed);
0870: if (sr == null) {
0871: sr = new SeedRecord(seed,
0872: SEED_DISPOSITION_NOT_PROCESSED);
0873: processedSeedsRecords.put(seed, sr);
0874: }
0875: sortedSet.add(sr);
0876: }
0877: return sortedSet.iterator();
0878: }
0879:
0880: public void crawlEnded(String message) {
0881: logger.info("Entered crawlEnded");
0882: this .sExitMessage = message; // held for reference by reports
0883: super .crawlEnded(message);
0884: logger.info("Leaving crawlEnded");
0885: }
0886:
0887: /**
0888: * @param writer Where to write.
0889: */
0890: protected void writeSeedsReportTo(PrintWriter writer) {
0891: // Build header.
0892: writer.print("[code] [status] [seed] [redirect]\n");
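// A data line might look like the following (hypothetical seed):
// 200 CRAWLED http://example.com/ http://www.example.com/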
0893:
0894: seedsCrawled = 0;
0895: seedsNotCrawled = 0;
0896: for (Iterator i = getSeedRecordsSortedByStatusCode(getSeeds()); i
0897: .hasNext();) {
0898: SeedRecord sr = (SeedRecord) i.next();
0899: writer.print(sr.getStatusCode());
0900: writer.print(" ");
0901: if ((sr.getStatusCode() > 0)) {
0902: seedsCrawled++;
0903: writer.print("CRAWLED");
0904: } else {
0905: seedsNotCrawled++;
0906: writer.print("NOTCRAWLED");
0907: }
0908: writer.print(" ");
0909: writer.print(sr.getUri());
0910: if (sr.getRedirectUri() != null) {
0911: writer.print(" ");
0912: writer.print(sr.getRedirectUri());
0913: }
0914: writer.print("\n");
0915: }
0916: }
0917:
0918: protected void writeSourceReportTo(PrintWriter writer) {
0919:
0920: writer.print("[source] [host] [#urls]\n");
0921: // for each source
0922: for (Iterator i = sourceHostDistribution.keySet().iterator(); i
0923: .hasNext();) {
0924: Object sourceKey = i.next();
0925: Map<String, LongWrapper> hostCounts = (Map<String, LongWrapper>) sourceHostDistribution
0926: .get(sourceKey);
0927: // sort hosts by #urls
0928: SortedMap sortedHostCounts = getReverseSortedHostCounts(hostCounts);
0929: // for each host
0930: for (Iterator j = sortedHostCounts.keySet().iterator(); j
0931: .hasNext();) {
0932: Object hostKey = j.next();
0933: LongWrapper hostCount = (LongWrapper) hostCounts
0934: .get(hostKey);
0935: writer.print(sourceKey.toString());
0936: writer.print(" ");
0937: writer.print(hostKey.toString());
0938: writer.print(" ");
0939: writer.print(hostCount.longValue);
0940: writer.print("\n");
0941: }
0942: }
0943: }
0944:
0945: /**
0946: * Return a copy of the given host counts map in reverse-sorted (largest
0947: * first) order.
0948: *
0949: * @return SortedMap of host counts
0950: */
0951: public SortedMap getReverseSortedHostCounts(
0952: Map<String, LongWrapper> hostCounts) {
0953: synchronized (hostCounts) {
0954: return getReverseSortedCopy(hostCounts);
0955: }
0956: }
0957:
0958: protected void writeHostsReportTo(PrintWriter writer) {
0959: SortedMap hd = getReverseSortedHostsDistribution();
0960: // header
0961: writer.print("[#urls] [#bytes] [host]\n");
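// A data line might look like the following (hypothetical host):
// 1200 34567890 example.com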
0962: for (Iterator i = hd.keySet().iterator(); i.hasNext();) {
0963: // Key is 'host'.
0964: Object key = i.next();
0965: if (hd.get(key) != null) {
0966: writer.print(((LongWrapper) hd.get(key)).longValue);
0967: } else {
0968: writer.print("-");
0969: }
0970: writer.print(" ");
0971: writer.print(getBytesPerHost((String) key));
0972: writer.print(" ");
0973: writer.print((String) key);
0974: writer.print("\n");
0975: }
0976: }
0977:
0978: /**
0979: * Return a copy of the hosts distribution in reverse-sorted
0980: * (largest first) order.
0981: * @return SortedMap of hosts distribution
0982: */
0983: public SortedMap getReverseSortedHostsDistribution() {
0984: synchronized (hostsDistribution) {
0985: return getReverseSortedCopy(hostsDistribution);
0986: }
0987: }
0988:
0989: protected void writeMimetypesReportTo(PrintWriter writer) {
0990: // header
0991: writer.print("[#urls] [#bytes] [mime-types]\n");
0992: TreeMap fd = getReverseSortedCopy(getFileDistribution());
0993: for (Iterator i = fd.keySet().iterator(); i.hasNext();) {
0994: Object key = i.next();
0995: // Key is mime type.
0996: writer.print(Long
0997: .toString(((LongWrapper) fd.get(key)).longValue));
0998: writer.print(" ");
0999: writer.print(Long
1000: .toString(getBytesPerFileType((String) key)));
1001: writer.print(" ");
1002: writer.print((String) key);
1003: writer.print("\n");
1004: }
1005: }
1006:
1007: protected void writeResponseCodeReportTo(PrintWriter writer) {
1008: // Build header.
1009: writer.print("[rescode] [#urls]\n");
1010: TreeMap scd = getReverseSortedCopy(getStatusCodeDistribution());
1011: for (Iterator i = scd.keySet().iterator(); i.hasNext();) {
1012: Object key = i.next();
1013: writer.print((String) key);
1014: writer.print(" ");
1015: writer.print(Long
1016: .toString(((LongWrapper) scd.get(key)).longValue));
1017: writer.print("\n");
1018: }
1019: }
1020:
1021: protected void writeCrawlReportTo(PrintWriter writer) {
1022: writer.print("Crawl Name: "
1023: + controller.getOrder().getCrawlOrderName());
1024: writer.print("\nCrawl Status: " + sExitMessage);
1025: writer
1026: .print("\nDuration Time: "
1027: + ArchiveUtils
1028: .formatMillisecondsToConventional(crawlDuration()));
1029: writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
1030: writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
1031: // hostsDistribution contains all hosts crawled plus an entry for dns.
1032: writer.print("\nTotal Hosts Crawled: "
1033: + (hostsDistribution.size() - 1));
1034: writer.print("\nTotal Documents Crawled: " + finishedUriCount);
1035: writer.print("\nProcessed docs/sec: "
1036: + ArchiveUtils.doubleToString(docsPerSecond, 2));
1037: writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
1038: writer.print("\nTotal Raw Data Size in Bytes: "
1039: + totalProcessedBytes
1040: + " ("
1041: + ArchiveUtils
1042: .formatBytesForDisplay(totalProcessedBytes)
1043: + ") \n");
1044: writer.print("Novel Bytes: "
1045: + crawledBytes.get(CrawledBytesHistotable.NOVEL)
1046: + " ("
1047: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1048: .get(CrawledBytesHistotable.NOVEL)) + ") \n");
1049: if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
1050: writer.print("Duplicate-by-hash Bytes: "
1051: + crawledBytes
1052: .get(CrawledBytesHistotable.DUPLICATE)
1053: + " ("
1054: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1055: .get(CrawledBytesHistotable.DUPLICATE))
1056: + ") \n");
1057: }
1058: if (crawledBytes
1059: .containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
1060: writer.print("Not-modified Bytes: "
1061: + crawledBytes
1062: .get(CrawledBytesHistotable.NOTMODIFIED)
1063: + " ("
1064: + ArchiveUtils.formatBytesForDisplay(crawledBytes
1065: .get(CrawledBytesHistotable.NOTMODIFIED))
1066: + ") \n");
1067: }
1068: }
1069:
1070: protected void writeProcessorsReportTo(PrintWriter writer) {
1071: controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
1072: }
1073:
1074: protected void writeReportFile(String reportName, String filename) {
1075: File f = new File(controller.getDisk().getPath(), filename);
1076: try {
1077: PrintWriter bw = new PrintWriter(new FileWriter(f));
1078: writeReportTo(reportName, bw);
1079: bw.close();
1080: controller.addToManifest(f.getAbsolutePath(),
1081: CrawlController.MANIFEST_REPORT_FILE, true);
1082: } catch (IOException e) {
1083: logger.log(Level.SEVERE, "Unable to write "
1084: + f.getAbsolutePath() + " at the end of crawl.", e);
1085: }
1086: logger.info("wrote report: " + f.getAbsolutePath());
1087: }
1088:
1089: /**
1090: * @param writer Where to write.
1091: */
1092: protected void writeManifestReportTo(PrintWriter writer) {
1093: controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
1094: }
1095:
1096: /**
1097: * @param reportName Name of report.
1098: * @param w Where to write.
1099: */
1100: private void writeReportTo(String reportName, PrintWriter w) {
1101: if ("hosts".equals(reportName)) {
1102: writeHostsReportTo(w);
1103: } else if ("mime types".equals(reportName)) {
1104: writeMimetypesReportTo(w);
1105: } else if ("response codes".equals(reportName)) {
1106: writeResponseCodeReportTo(w);
1107: } else if ("seeds".equals(reportName)) {
1108: writeSeedsReportTo(w);
1109: } else if ("crawl".equals(reportName)) {
1110: writeCrawlReportTo(w);
1111: } else if ("processors".equals(reportName)) {
1112: writeProcessorsReportTo(w);
1113: } else if ("manifest".equals(reportName)) {
1114: writeManifestReportTo(w);
1115: } else if ("frontier".equals(reportName)) {
1116: writeFrontierReportTo(w);
1117: } else if ("source".equals(reportName)) {
1118: writeSourceReportTo(w);
1119: } // TODO: else default/error
1120: }
1121:
1122: /**
1123: * Write the Frontier's 'nonempty' report (if available)
1124: * @param writer to report to
1125: */
1126: protected void writeFrontierReportTo(PrintWriter writer) {
1127: if (controller.getFrontier().isEmpty()) {
1128: writer.println("frontier empty");
1129: } else {
1130: controller.getFrontier().reportTo("nonempty", writer);
1131: }
1132: }
1133:
1134: /**
1135: * Run the reports.
1136: */
1137: public void dumpReports() {
1138: // Add all files mentioned in the crawl order to the
1139: // manifest set.
1140: controller.addOrderToManifest();
1141: writeReportFile("hosts", "hosts-report.txt");
1142: writeReportFile("mime types", "mimetype-report.txt");
1143: writeReportFile("response codes", "responsecode-report.txt");
1144: writeReportFile("seeds", "seeds-report.txt");
1145: writeReportFile("crawl", "crawl-report.txt");
1146: writeReportFile("processors", "processors-report.txt");
1147: writeReportFile("manifest", "crawl-manifest.txt");
1148: writeReportFile("frontier", "frontier-report.txt");
1149: if (!sourceHostDistribution.isEmpty()) {
1150: writeReportFile("source", "source-report.txt");
1151: }
1152: // TODO: Save object to disk?
1153: }
1154:
1155: public void crawlCheckpoint(File cpDir) throws Exception {
1156: // CrawlController is managing the checkpointing of this object.
1157: logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
1158: }
1159: }
|