001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: /**
036: * Download parameters. These parameters are limits on
037: * how Page can download a Link. A Crawler has a
038: * default set of download parameters, but the defaults
039: * can be overridden on individual links by calling
040: * Link.setDownloadParameters().
041: * <P>
042: * DownloadParameters is an immutable class (like String).
043: * "Changing" a parameter actually returns a new instance
044: * of the class with only the specified parameter changed.
045: */
public class DownloadParameters implements Cloneable
//#ifdef JDK1.1
, java.io.Serializable
//#endif JDK1.1
{
    // NOTE: every field below is copied wholesale by Object.clone(), which is
    // how the change*() methods produce a modified copy; a new field added
    // here is picked up automatically with no further code changes.

    private int maxThreads = 4;
    // number of background threads used by the crawler
    private int maxPageSize = 100;
    // maximum page size in kilobytes (-1 for no maximum)
    private int downloadTimeout = 60;
    // timeout for a single page, in seconds (-1 for no timeout)
    private int crawlTimeout = -1;
    // timeout for entire crawl in seconds (-1 for no timeout)
    private boolean obeyRobotExclusion = false;
    // obey crawling rules in robots.txt

    // not implemented yet
    //     private int maxRequestsPerServer = 2;
    //     // maximum number of simultaneous requests to a server (-1 for no maximum)
    //     private int delay = 500;
    //     // delay (in milliseconds) between starts of requests to same server (0 for no delay)

    private boolean interactive = true;
    // user is available to answer dialog boxes, e.g. for authentication
    private boolean useCaches = true;
    // use cached pages to satisfy requests wherever possible
    private String acceptedMIMETypes = null;
    // accept header for HTTP request, or null to use default
    private String userAgent = null;
    // User-Agent header for HTTP request, or null to use default

    /**
     * Shared instance with every parameter at its default value.
     */
    public static final DownloadParameters DEFAULT = new DownloadParameters();

    /**
     * Shared instance with the page-size, download, and crawl limits
     * all disabled (set to -1).  Other parameters keep their defaults.
     */
    public static final DownloadParameters NO_LIMITS = DEFAULT
            .changeMaxPageSize(-1).changeDownloadTimeout(-1)
            .changeCrawlTimeout(-1);

    /**
     * Make a DownloadParameters object with default settings.
     */
    public DownloadParameters() {
    }

    /**
     * Clone a DownloadParameters object.  Used internally by every
     * change*() method: the copy is mutated and returned while this
     * (conceptually immutable) instance is left untouched.
     */
    public Object clone() {
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable.  The cause is
            // folded into the message rather than chained because the
            // RuntimeException(String, Throwable) constructor does not exist
            // on the pre-1.4 JDKs this file targets (see #ifdef JDK1.1).
            throw new RuntimeException("Internal error: " + e);
        }
    }

    /**
     * Get maximum threads.
     * @return maximum number of background threads used by crawler.
     * Default is 4.
     */
    public int getMaxThreads() {
        return maxThreads;
    }

    /**
     * Set maximum threads.
     * @param maxthreads maximum number of background threads used by crawler
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxThreads(int maxthreads) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.maxThreads = maxthreads;
        return dp;
    }

    /**
     * Get maximum page size.  Pages larger than this limit are neither
     * downloaded nor parsed.
     * Default value is 100 (KB).  A negative value (conventionally -1)
     * means no limit.
     * @return maximum page size in kilobytes
     */
    public int getMaxPageSize() {
        return maxPageSize;
    }

    /**
     * Change maximum page size.  Pages larger than this limit are treated as
     * leaves in the crawl graph -- neither downloaded nor parsed.
     * @param maxPageSize maximum page size in kilobytes; use a negative
     * value for no limit
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeMaxPageSize(int maxPageSize) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.maxPageSize = maxPageSize;
        return dp;
    }

    /**
     * Get download timeout value.
     * @return length of time (in seconds) that crawler will wait for a page
     * to download before aborting it.  Negative means no timeout.
     * Default is 60 seconds.
     */
    public int getDownloadTimeout() {
        return downloadTimeout;
    }

    /**
     * Change download timeout value.
     * @param timeout length of time (in seconds) to wait for a page to download.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeDownloadTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.downloadTimeout = timeout;
        return dp;
    }

    /**
     * Get timeout on entire crawl.
     * @return maximum length of time (in seconds) that crawler will run
     * before aborting.  Default is -1 (no limit).
     */
    public int getCrawlTimeout() {
        return crawlTimeout;
    }

    /**
     * Change crawl timeout value.
     * @param timeout maximum length of time (in seconds) that crawler will run.
     * Use a negative value to turn off timeout.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeCrawlTimeout(int timeout) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.crawlTimeout = timeout;
        return dp;
    }

    /**
     * Get obey-robot-exclusion flag.
     * @return true iff the
     * crawler checks robots.txt on the remote Web site
     * before downloading a page.  Default is false.
     */
    public boolean getObeyRobotExclusion() {
        return obeyRobotExclusion;
    }

    /**
     * Change obey-robot-exclusion flag.
     * @param f If true, then the
     * crawler checks robots.txt on the remote Web site
     * before downloading a page.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeObeyRobotExclusion(boolean f) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.obeyRobotExclusion = f;
        return dp;
    }

    /**
     * Get interactive flag.
     * @return true if a user is available to respond to
     * dialog boxes (for instance, to enter passwords for
     * authentication).  Default is true.
     */
    public boolean getInteractive() {
        return interactive;
    }

    /**
     * Change interactive flag.
     * @param f true if a user is available to respond
     * to dialog boxes
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeInteractive(boolean f) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.interactive = f;
        return dp;
    }

    /**
     * Get use-caches flag.
     * @return true if cached pages should be used whenever
     * possible.  Default is true.
     */
    public boolean getUseCaches() {
        return useCaches;
    }

    /**
     * Change use-caches flag.
     * @param f true if cached pages should be used whenever possible
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUseCaches(boolean f) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.useCaches = f;
        return dp;
    }

    /**
     * Get accepted MIME types.
     * @return list of MIME types that can be handled by
     * the crawler (which are passed as the Accept header
     * in the HTTP request).
     * Default is null.
     */
    public String getAcceptedMIMETypes() {
        return acceptedMIMETypes;
    }

    /**
     * Change accepted MIME types.
     * @param types list of MIME types that can be handled
     * by the crawler.  Use null if the crawler can handle anything.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeAcceptedMIMETypes(String types) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.acceptedMIMETypes = types;
        return dp;
    }

    /**
     * Get User-agent header used in HTTP requests.
     * @return user-agent field used in HTTP requests,
     * or null if the Java library's default user-agent
     * is used.  Default value is null (but for a Crawler,
     * the default DownloadParameters has the Crawler's
     * name as its default user-agent).
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Change User-agent field used in HTTP requests.
     * @param userAgent user-agent field used in HTTP
     * requests.  Pass null to use the Java library's default
     * user-agent field.
     * @return new DownloadParameters object with the specified parameter changed.
     */
    public DownloadParameters changeUserAgent(String userAgent) {
        DownloadParameters dp = (DownloadParameters) clone();
        dp.userAgent = userAgent;
        return dp;
    }
}
|