Source Code Cross Referenced for AbstractTracker.java in  » Web-Crawler » heritrix » org » archive » crawler » framework » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.framework 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         */
019:        package org.archive.crawler.framework;
020:
021:        import java.io.Serializable;
022:        import java.util.Date;
023:        import java.util.EventObject;
024:        import java.util.logging.Level;
025:
026:        import javax.management.AttributeNotFoundException;
027:
028:        import org.archive.crawler.event.CrawlStatusListener;
029:        import org.archive.crawler.framework.exceptions.FatalConfigurationException;
030:        import org.archive.crawler.settings.ModuleType;
031:        import org.archive.crawler.settings.SimpleType;
032:        import org.archive.crawler.settings.Type;
033:        import org.archive.util.ArchiveUtils;
034:        import org.archive.util.PaddingStringBuffer;
035:        import org.xbill.DNS.DClass;
036:        import org.xbill.DNS.Lookup;
037:
038:        /**
039:         * A partial implementation of the StatisticsTracking interface.
040:         * <p>
041:         * It covers the thread handling. (Launching, pausing etc.)  Included in this is
042:         * keeping track of the total time spent (actually) crawling.  Several methods
043:         * to access the time started, finished etc. are provided.
044:         * <p>
045:         * To handle the thread work the class implements the CrawlStatusListener and
046:         * uses its events to pause, resume and stop logging of statistics. The run()
047:         * method will call logActivity() at intervals specified in the crawl order.
048:         * <p>
049:         * Implementation of logActivity (the actual logging) as well as listening for
050:         * CrawlURIDisposition events is not addressed.
051:         *
052:         * @author Kristinn Sigurdsson
053:         *
054:         * @see org.archive.crawler.framework.StatisticsTracking
055:         * @see org.archive.crawler.admin.StatisticsTracker
056:         */
057:        public abstract class AbstractTracker extends ModuleType implements 
058:                StatisticsTracking, CrawlStatusListener, Serializable {
059:            /** Default period between logging stat values */
060:            public static final Integer DEFAULT_STATISTICS_REPORT_INTERVAL = new Integer(
061:                    20);
062:            /** Attribute name for logging interval in seconds setting
063:             */
064:            public static final String ATTR_STATS_INTERVAL = "interval-seconds";
065:
066:            /** A reference to the CrawlContoller of the crawl that we are to track
067:             * statistics for.
068:             */
069:            protected transient CrawlController controller;
070:
071:            // Keep track of time.
072:            protected long crawlerStartTime;
073:            protected long crawlerEndTime = -1; // Until crawl ends, this value is -1.
074:            protected long crawlerPauseStarted = 0;
075:            protected long crawlerTotalPausedTime = 0;
076:
077:            /** Timestamp of when this logger last wrote something to the log */
078:            protected long lastLogPointTime;
079:
080:            protected boolean shouldrun = true;
081:
082:            /**
083:             * @param name
084:             * @param description
085:             */
086:            public AbstractTracker(String name, String description) {
087:                super (name, description);
088:                Type e = addElementToDefinition(new SimpleType(
089:                        ATTR_STATS_INTERVAL,
090:                        "The interval between writing progress information to log.",
091:                        DEFAULT_STATISTICS_REPORT_INTERVAL));
092:                e.setOverrideable(false);
093:            }
094:
095:            /**
096:             * Sets up the Logger (including logInterval) and registers with the
097:             * CrawlController for CrawlStatus and CrawlURIDisposition events.
098:             *
099:             * @param c A crawl controller instance.
100:             * @throws FatalConfigurationException Not thrown here. For overrides that
101:             * go to settings system for configuration.
102:             * @see CrawlStatusListener
103:             * @see org.archive.crawler.event.CrawlURIDispositionListener
104:             */
105:            public void initialize(CrawlController c)
106:                    throws FatalConfigurationException {
107:                this .controller = c;
108:
109:                // Add listeners
110:                this .controller.addCrawlStatusListener(this );
111:            }
112:
113:            /**
114:             * Start thread.  Will call logActivity() at intervals specified by
115:             * logInterval
116:             *
117:             */
118:            public void run() {
119:                // Don't start logging if we have no logger
120:                if (this .controller == null) {
121:                    return;
122:                }
123:
124:                shouldrun = true; //If we are starting, this should always be true.
125:
126:                // Log the legend
127:                this .controller
128:                        .logProgressStatistics(progressStatisticsLegend());
129:                lastLogPointTime = System.currentTimeMillis(); // The first interval begins now.
130:
131:                // Keep logging until someone calls stop()
132:                while (shouldrun) {
133:                    // Pause before writing the first entry (so we have real numbers)
134:                    // and then pause between entries
135:                    try {
136:                        Thread.sleep(getLogWriteInterval() * 1000);
137:                    } catch (InterruptedException e) {
138:                        e.printStackTrace();
139:                        controller.runtimeErrors
140:                                .log(Level.INFO,
141:                                        "Periodic stat logger interrupted while sleeping.");
142:                    }
143:
144:                    // In case stop() was invoked while the thread was sleeping or we
145:                    // are paused.
146:                    if (shouldrun && getCrawlPauseStartedTime() == 0) {
147:                        progressStatisticsEvent(new EventObject(this ));
148:                    }
149:                }
150:            }
151:
152:            /**
153:             * @return legend for progress-statistics lines/log
154:             */
155:            public String progressStatisticsLegend() {
156:                return "           timestamp"
157:                        + "  discovered   "
158:                        + "   queued   downloaded       doc/s(avg)  KB/s(avg) "
159:                        + "  dl-failures   busy-thread   mem-use-KB  heap-size-KB "
160:                        + "  congestion   max-depth   avg-depth";
161:            }
162:
163:            /**
164:             * Notify tracker that crawl has begun. Must be called
165:             * outside tracker's own thread, to ensure it is noted
166:             * before other threads start interacting with tracker. 
167:             */
168:            public void noteStart() {
169:                if (this .crawlerStartTime == 0) {
170:                    // Note the time the crawl starts (only if not already set)
171:                    this .crawlerStartTime = System.currentTimeMillis();
172:                }
173:            }
174:
175:            /**
176:             * A method for logging current crawler state.
177:             *
178:             * This method will be called by run() at intervals specified in
179:             * the crawl order file.  It is also invoked when pausing or
180:             * stopping a crawl to capture the state at that point.  Default behavior is
181:             * call to {@link CrawlController#logProgressStatistics} so CrawlController
182:             * can act on progress statistics event.
183:             * <p>
184:             * It is recommended that for implementations of this method it be
185:             * carefully considered if it should be synchronized in whole or in
186:             * part
187:             * @param e Progress statistics event.
188:             */
189:            protected synchronized void progressStatisticsEvent(
190:                    final EventObject e) {
191:                this .controller.progressStatisticsEvent(e);
192:                // temporary workaround for 
193:                // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
194:                // http://sourceforge.net/support/tracker.php?aid=996161
195:                Lookup.getDefaultCache(DClass.IN).clearCache();
196:            }
197:
198:            /**
199:             * Get the starting time of the crawl (as given by
200:             * <code>System.currentTimeMillis()</code> when the crawl started).
201:             * @return time fo the crawl's start
202:             */
203:            public long getCrawlStartTime() {
204:                return this .crawlerStartTime;
205:            }
206:
207:            /**
208:             * If crawl has ended it will return the time it ended (given by
209:             * <code>System.currentTimeMillis()</code> at that time).
210:             * <br>
211:             * If crawl is still going on it will return the same as
212:             * <code>System.currentTimeMillis()</code> at the time of the call.
213:             * @return The time of the crawl ending or the current time if the crawl has
214:             *         not ended.
215:             */
216:            public long getCrawlEndTime() {
217:                return (this .crawlerEndTime == -1) ? System.currentTimeMillis()
218:                        : this .crawlerEndTime;
219:            }
220:
221:            /**
222:             * Returns the number of milliseconds that the crawl spent paused or
223:             * otherwise in a nonactive state.
224:             * @return the number of msec. that the crawl was paused or otherwise
225:             *         suspended.
226:             */
227:            public long getCrawlTotalPauseTime() {
228:                return this .crawlerTotalPausedTime;
229:            }
230:
231:            /**
232:             * Get the time when the the crawl was last paused/suspended (as given by
233:             * <code>System.currentTimeMillis()</code> at that time). Will be 0 if the
234:             * crawl is not currently paused.
235:             * @return time of the crawl's last pause/suspend or 0 if the crawl is not
236:             *         currently paused.
237:             */
238:            public long getCrawlPauseStartedTime() {
239:                return this .crawlerPauseStarted;
240:            }
241:
242:            public long getCrawlerTotalElapsedTime() {
243:                if (getCrawlStartTime() == 0) {
244:                    // if no start time set yet, consider elapsed time zero
245:                    return 0;
246:                }
247:
248:                return (getCrawlPauseStartedTime() != 0) ?
249:                // Are currently paused, calculate time up to last pause
250:                (getCrawlPauseStartedTime() - getCrawlTotalPauseTime() - getCrawlStartTime())
251:                        :
252:                        // Not paused, calculate total time.
253:                        (getCrawlEndTime() - getCrawlTotalPauseTime() - getCrawlStartTime());
254:            }
255:
256:            /**
257:             * The number of seconds to wait between writing snapshot data to log file.
258:             * @return the number of seconds to wait between writing snapshot data to
259:             * log file.
260:             */
261:            protected int getLogWriteInterval() {
262:                int logInterval;
263:                try {
264:                    logInterval = ((Integer) getAttribute(null,
265:                            ATTR_STATS_INTERVAL)).intValue();
266:                } catch (AttributeNotFoundException e) {
267:                    logInterval = 10;
268:                }
269:                return logInterval;
270:            }
271:
272:            /**
273:             * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
274:             */
275:            public void crawlPausing(String statusMessage) {
276:                logNote("CRAWL WAITING - " + statusMessage);
277:            }
278:
279:            protected void logNote(final String note) {
280:                this .controller.logProgressStatistics(new PaddingStringBuffer()
281:                        .append(ArchiveUtils.get14DigitDate()).append(" ")
282:                        .append(note).toString());
283:            }
284:
285:            public void crawlPaused(String statusMessage) {
286:                crawlerPauseStarted = System.currentTimeMillis();
287:                progressStatisticsEvent(new EventObject(this ));
288:                logNote("CRAWL PAUSED - " + statusMessage);
289:            }
290:
291:            public void crawlResuming(String statusMessage) {
292:                tallyCurrentPause();
293:                logNote("CRAWL RESUMED - " + statusMessage);
294:                lastLogPointTime = System.currentTimeMillis();
295:            }
296:
297:            /**
298:             * For a current pause (if any), add paused time to total and reset
299:             */
300:            protected void tallyCurrentPause() {
301:                if (this .crawlerPauseStarted > 0) {
302:                    // Ok, we managed to actually pause before resuming.
303:                    this .crawlerTotalPausedTime += (System.currentTimeMillis() - this .crawlerPauseStarted);
304:                }
305:                this .crawlerPauseStarted = 0;
306:            }
307:
308:            public void crawlEnding(String sExitMessage) {
309:                logNote("CRAWL ENDING - " + sExitMessage);
310:            }
311:
312:            /**
313:             * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
314:             */
315:            public void crawlEnded(String sExitMessage) {
316:                // Note the time when the crawl stops.
317:                crawlerEndTime = System.currentTimeMillis();
318:                progressStatisticsEvent(new EventObject(this ));
319:                logNote("CRAWL ENDED - " + sExitMessage);
320:                shouldrun = false;
321:                dumpReports();
322:                finalCleanup();
323:            }
324:
325:            public void crawlStarted(String message) {
326:                tallyCurrentPause();
327:                noteStart();
328:            }
329:
330:            /**
331:             * Dump reports, if any, on request or at crawl end. 
332:             */
333:            protected void dumpReports() {
334:                // by default do nothing; subclasses may override
335:            }
336:
337:            /**
338:             * Cleanup resources used, at crawl end. 
339:             */
340:            protected void finalCleanup() {
341:                controller = null; // Facilitate GC.
342:            }
343:
344:            /**
345:             * @see org.archive.crawler.framework.StatisticsTracking#crawlDuration()
346:             */
347:            public long crawlDuration() {
348:                return getCrawlerTotalElapsedTime();
349:            }
350:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.