001: /* URIFrontierHostStatistics
002: *
003: * $Id: FrontierHostStatistics.java 2509 2004-09-02 02:16:11Z gojomo $
004: *
005: * Created on Mar 30, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.framework;
026:
/**
 * Optional contract through which a Frontier can expose progress information
 * about individual hosts.
 *
 * <p>Host-level statistics are primarily meaningful for Frontier
 * implementations whose internal structure divides the workload by host, for
 * example in order to enforce politeness. Frontiers organized differently may
 * still implement this interface and derive the figures by calculation.
 *
 * <p>The following information is offered:
 * <ul>
 *   <li>{@link #activeHosts() Active hosts}</li>
 *   <li>{@link #inactiveHosts() Inactive hosts}</li>
 *   <li>{@link #deferredHosts() Deferred hosts}</li>
 *   <li>{@link #inProcessHosts() In process hosts}</li>
 *   <li>{@link #readyHosts() Ready hosts}</li>
 *   <li>{@link #hostStatus(String) Host status}</li>
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.Frontier
 */
public interface FrontierHostStatistics {

    // Interface fields are implicitly public, static and final, so the
    // status codes below are declared without explicit modifiers.

    /**
     * Status code: the Frontier has never encountered the host, or it has
     * encountered the host but the host stayed inactive for so long that it
     * has expired.
     */
    int HOST_UNKNOWN = 0;

    /** Status code: the host has URIs ready to be emitted. */
    int HOST_READY = 1;

    /** Status code: the host has URIs that are currently being processed. */
    int HOST_INPROCESS = 2;

    /**
     * Status code: the host has been deferred for some amount of time and
     * becomes ready again once that time has elapsed. The most likely causes
     * are politeness or waiting between retries, although other conditions
     * may exist.
     */
    int HOST_DEFERRED = 3;

    /**
     * Status code: the host has been encountered and every URI available for
     * it has already been processed. More URIs may or may not become
     * available later. Inactive hosts may eventually be 'forgotten'.
     */
    int HOST_INACTIVE = 4;

    // Interface methods are implicitly public and abstract.

    /**
     * Reports how many hosts are currently active.
     *
     * <p>A host counts as active when it is ready, deferred or in process.
     *
     * @return Total number of hosts that are currently active.
     */
    int activeHosts();

    /**
     * Reports how many hosts are inactive.
     *
     * <p>A host is inactive when it has been active in the past but is now
     * exhausted, i.e. it holds no additional URIs.
     *
     * @return Total number of inactive hosts.
     */
    int inactiveHosts();

    /**
     * Reports how many hosts are deferred.
     *
     * <p>A deferred host is a currently active host that is being held back
     * from processing for the time being (because of politeness or because
     * it is waiting before a retry).
     *
     * @return Total number of deferred hosts.
     */
    int deferredHosts();

    /**
     * Reports how many hosts have URIs in process.
     *
     * <p>The general assumption is that a host has at most one URI in
     * process at any given time. Some frontiers may implement politeness
     * differently, which means that the same host can be both ready and in
     * process. {@link #activeHosts() activeHosts()} will not count such a
     * host twice though.
     *
     * @return Total number of hosts with URIs in process.
     */
    int inProcessHosts();

    /**
     * Reports how many hosts have a URI ready for processing.
     *
     * @return Total number of hosts that have a URI ready for processing.
     */
    int readyHosts();

    /**
     * Looks up the status of a single host.
     *
     * <p>A host is in exactly one of these states:
     * <ul>
     *   <li>{@link #HOST_READY Ready}</li>
     *   <li>{@link #HOST_INPROCESS In process}</li>
     *   <li>{@link #HOST_DEFERRED Deferred}</li>
     *   <li>{@link #HOST_INACTIVE Inactive}</li>
     *   <li>{@link #HOST_UNKNOWN Unknown}</li>
     * </ul>
     *
     * <p>Some Frontiers may allow a host to have more than one URI in process
     * at the same time. Such a host is reported as
     * {@link #HOST_READY Ready} for as long as it has more URIs ready for
     * processing. Only once it has no more possible URIs for processing is
     * it reported as {@link #HOST_INPROCESS In process}.
     *
     * @param host The name of the host to look up the status for.
     * @return The status of the specified host.
     *
     * @see #HOST_DEFERRED
     * @see #HOST_INACTIVE
     * @see #HOST_INPROCESS
     * @see #HOST_READY
     * @see #HOST_UNKNOWN
     */
    int hostStatus(String host);

}
|