/*
 * CrawlOrder
 *
 * $Header$
 *
 * Created on May 15, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the 'root' of the settings hierarchy. Contains those settings that
 * do not belong to any specific module, but rather relate to the crawl as a
 * whole (much of this is used by the CrawlController directly or indirectly).
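 *
 * <p>Illustrative sketch (not part of the original documentation): the
 * settings declared here are read back through the attribute accessors
 * inherited from the settings framework; the variable name {@code order}
 * below is an assumed handle on the running CrawlOrder instance:
 * <pre>{@code
 * try {
 *     // null context means the global (non-overridden) settings
 *     int maxToes = ((Integer) order.getAttribute(null,
 *         CrawlOrder.ATTR_MAX_TOE_THREADS)).intValue();
 * } catch (AttributeNotFoundException e) {
 *     // not expected for attributes declared by CrawlOrder itself
 * }
 * // convenience accessors wrap the same lookups
 * int sameValue = order.getMaxToes();
 * }</pre>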
 *
 * @see org.archive.crawler.settings.ModuleType
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES = "recover-retain-failures";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD = "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS = "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER = "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER = "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to bdb cache */
    public static final String ATTR_BDB_CACHE_PERCENT = "bdb-cache-percent";

    /**
     * When checkpointing, copy the bdb logs.
     * Default is true. If false, then we do not copy logs on checkpoint AND
     * we tell bdbje never to delete log files; instead it renames
     * files-to-delete with a '.del' extension. The assumption is that when
     * this setting is false, an external process is managing the removal of
     * bdbje log files and that, when it comes time to recover from a
     * checkpoint, the files that comprise a checkpoint are manually
     * assembled.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS = Boolean.TRUE;

    /**
     * Default size of bdb cache.
     */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex for acceptable user-agent format.
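     *
     * <p>For illustration only (example value, not from the original source):
     * a user-agent such as
     * {@code "Mozilla/5.0 (compatible; examplebot/1.0 +http://example.org/crawl)"}
     * satisfies this pattern, because it carries a parenthesized
     * '+http(s)://...' project URL with a dotted host.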
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /**
     * Regex for acceptable from address.
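     *
     * <p>For illustration only (example value, not from the original source):
     * {@code "webmaster@example.org"} satisfies this pattern, which requires
     * something of the shape {@code name@host.domain}.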
     */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";

    /** Construct a CrawlOrder.
     */
    public CrawlOrder() {
        super(ATTR_NAME,
            "Heritrix crawl order. This forms the root of "
                + "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(
            ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings "
                + "for many modules can be overridden based on the domain or "
                + "subdomain of the URI being processed. This setting specifies"
                + " a file-level directory to store those settings. The path"
                + " is relative to 'disk-path' unless"
                + " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_DISK_PATH,
            "Directory where logs, arcs and other run-time files will "
                + "be kept. If this path is a relative path, it will be "
                + "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. "
                + "If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. "
                + "If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is"
                + " exceeded the crawler will stop. "
                + "A value of zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number"
                + " is exceeded the crawler will stop. "
                + "A value of zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this"
                + " much time has elapsed the crawler will stop. A value of"
                + " zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound "
                + "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound "
                + "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. "
                + "Default of zero means no preference (accept BDB's default, "
                + "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS,
            "HTTP headers. Information that will "
                + "be used when constructing the HTTP headers of "
                + "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(
            ATTR_USER_AGENT,
            "User agent to act as. Field must contain a valid URL "
                + "that links to the website of the person or organization "
                + "running the crawl. Replace 'PROJECT_URL_HERE' in the "
                + "initial template. For example, if the organization "
                + "is the Library of Congress, a valid user agent would be: "
                + "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 "
                + "+http://loc.gov)'. "
                + "Note, you must preserve the '+' before the 'http'.",
            "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(
            ATTR_FROM,
            "Contact information. This field must contain a valid "
                + "e-mail address for the person or organization responsible "
                + "for this crawl: e.g. 'webmaster@loc.gov'.",
            "CONTACT_EMAIL_ADDRESS_HERE"));

        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(Frontier.ATTR_NAME,
            "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(
            ATTR_RULES,
            "Ordered list of url canonicalization rules. "
                + "Rules are applied in the order listed from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
            ATTR_PRE_FETCH_PROCESSORS,
            "Processors to run prior to fetching anything from the network.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_FETCH_PROCESSORS,
            "Processors that fetch documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_EXTRACT_PROCESSORS,
            "Processors that extract new URIs from fetched documents.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_WRITE_PROCESSORS,
            "Processors that write documents to archives.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_POST_PROCESSORS,
            "Processors that do cleanup and feed the frontier with new URIs.",
            Processor.class));
        e.setOverrideable(false);

        loggers = (MapType) addElementToDefinition(new MapType(
            ATTR_LOGGERS,
            "Statistics tracking modules. Any number of specialized "
                + "statistics trackers that monitor a crawl and write logs, "
                + "reports and/or provide information to the user interface."));

        e = addElementToDefinition(new SimpleType(
            ATTR_RECOVER_PATH,
            "Optional. Points at a recover log (or recover.gz log) OR "
                + "the checkpoint directory to use when recovering a crawl.",
            ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to "
                + "the checkpoint directory. To recover a checkpoint, just "
                + "set the "
                + ATTR_RECOVER_PATH
                + " to point at the checkpoint "
                + "directory to recover. This is the default setting. "
                + "But if the crawl is large, "
                + "copying bdbje log files can take tens of minutes and even "
                + "upwards of an hour (copying bdbje log files will consume the "
                + "bulk of checkpointing time). If this setting is false, we do "
                + "NOT copy bdbje logs on checkpoint AND we set bdbje to NEVER "
                + "delete log files (instead we have it rename files-to-delete "
                + "with a '.del' extension). The assumption is that when this "
                + "setting is false, an external process is managing the removal "
                + "of bdbje log files and that, come time to recover from a "
                + "checkpoint, the files that comprise a checkpoint are manually "
                + "assembled. This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECOVER_RETAIN_FAILURES,
            "When recovering via the recover.log, should failures "
                + "in the log be retained in the recovered crawl, "
                + "preventing the corresponding URIs from being retried. "
                + "Default is false, meaning failures are forgotten, and "
                + "the corresponding URIs will be retried in the recovered "
                + "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new CredentialStore(
            CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }

    /**
     * @param curi settings context (the current URI); may be null to use
     * global settings
     * @return user-agent header value to use
     */
    public String getUserAgent(CrawlURI curi) {
        return ((String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT));
    }

    /**
     * @param curi settings context (the current URI); may be null to use
     * global settings
     * @return from header value to use
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * Returns the set number of maximum toe threads.
     * @return Number of maximum toe threads
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * Gets the RobotsHonoringPolicy object from the order file.
     *
     * @return the RobotsHonoringPolicy
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** Get the name of the order file.
     *
     * @return the name of the order file.
     */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /**
     * @return The crawl controller.
     */
    public CrawlController getController() {
        return controller;
    }

    /**
     * @param controller the crawl controller to associate with this order
     */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * Returns the Map of the StatisticsTracking modules that are included
     * in the configuration that this instance represents.
     * @return Map of the StatisticsTracking modules
     */
    public MapType getLoggers() {
        return loggers;
    }

    /**
     * Checks if the User Agent and From field are set 'correctly' in
     * the specified Crawl Order.
     *
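     * <p>For illustration (example values, not from the original source):
     * a user-agent of
     * {@code "Mozilla/5.0 (compatible; examplebot/1.0 +http://example.org/crawl)"}
     * together with a from address of {@code "webmaster@example.org"} would
     * pass this check; the shipped placeholders 'PROJECT_URL_HERE' and
     * 'CONTACT_EMAIL_ADDRESS_HERE' would not.
     *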
     * @throws FatalConfigurationException
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        // don't start the crawl if they're using the default user-agent
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException(
                "unacceptable user-agent or from (re-edit your order file).");
        }
    }

    /**
     * @return Checkpoint directory.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(
                null, CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /**
     * Return the full path to the directory named by <code>key</code>
     * in settings.
     * If the directory does not exist, it and all intermediary directories
     * will be created.
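     *
     * <p>Illustrative sketch (not part of the original documentation),
     * assuming {@code order} names the running CrawlOrder instance:
     * <pre>{@code
     * // resolve (and create, if necessary) the configured logs directory;
     * // callers handle or rethrow the declared AttributeNotFoundException
     * File logsDir = order.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
     * }</pre>
     *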
     * @param key key of the path setting to look up.
     * @return Full path to directory named by <code>key</code>.
     * @throws AttributeNotFoundException
     */
    public File getSettingsDir(String key) throws AttributeNotFoundException {
        String path = (String) getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }

}