001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * CoreAttributeConstants.java
020: * Created on Jun 17, 2003
021: *
022: * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src//**
023: * @author gojomo
024: *
025: */
026: package org.archive.crawler.datamodel;
027:
028: /**
029: * CrawlURI attribute keys used by the core crawler
030: * classes.
031: *
032: * @author gojomo
033: *
034: */
public interface CoreAttributeConstants {

    // NOTE: fields of an interface are implicitly public, static and final.
    // The modifiers are spelled out on every constant so that all
    // declarations in this file read the same way.

    /**
     * Extracted MIME type of fetched content; should be
     * set immediately by fetching module if possible
     * (rather than waiting for a later analyzer).
     */
    public static final String A_CONTENT_TYPE = "content-type";

    /**
     * Multiplier of last fetch duration to wait before
     * fetching another item of the same class (eg host).
     */
    public static final String A_DELAY_FACTOR = "delay-factor";

    /**
     * Minimum delay before fetching another item of the
     * same class (eg host). Even if lastFetchTime*delayFactor
     * is less than this, this period will be waited.
     */
    public static final String A_MINIMUM_DELAY = "minimum-delay";

    /**
     * Attribute key <code>"dns-records"</code>. Judging by the name this
     * holds the DNS resource-record set of a lookup (TODO: confirm against
     * the DNS fetching code).
     */
    public static final String A_RRECORD_SET_LABEL = "dns-records";

    /**
     * Attribute key <code>"dns-fetch-time"</code>. Presumably the time of
     * the DNS lookup (TODO: confirm units and meaning with the writer of
     * this attribute).
     */
    public static final String A_DNS_FETCH_TIME = "dns-fetch-time";

    /**
     * Attribute key <code>"dns-server-ip"</code>. Presumably the address
     * of the DNS server that was consulted (TODO: confirm).
     */
    public static final String A_DNS_SERVER_IP_LABEL = "dns-server-ip";

    /**
     * Attribute key <code>"fetch-began-time"</code>. Presumably the time
     * at which a fetch started (TODO: confirm units).
     */
    public static final String A_FETCH_BEGAN_TIME = "fetch-began-time";

    /**
     * Attribute key <code>"fetch-completed-time"</code>. Presumably the
     * time at which a fetch finished (TODO: confirm units).
     */
    public static final String A_FETCH_COMPLETED_TIME = "fetch-completed-time";

    /**
     * Attribute key <code>"http-transaction"</code>. Presumably refers to
     * the HTTP transaction object of a fetch (TODO: confirm value type).
     */
    public static final String A_HTTP_TRANSACTION = "http-transaction";

    /**
     * Attribute key <code>"runtime-exception"</code>. Presumably a runtime
     * exception recorded while processing a URI (TODO: confirm).
     */
    public static final String A_RUNTIME_EXCEPTION = "runtime-exception";

    /**
     * Attribute key <code>"localized-errors"</code>. Presumably errors
     * that were caught and recorded rather than propagated (TODO: confirm).
     */
    public static final String A_LOCALIZED_ERRORS = "localized-errors";

    /**
     * Shorthand string tokens indicating notable occurrences,
     * separated by commas.
     */
    public static final String A_ANNOTATIONS = "annotations";

    /**
     * Attribute key <code>"prerequisite-uri"</code>. Presumably a URI that
     * must be handled before the one carrying this attribute (TODO:
     * confirm).
     */
    public static final String A_PREREQUISITE_URI = "prerequisite-uri";

    /**
     * Attribute key <code>"distance-from-seed"</code>. Presumably the
     * distance of a URI from a seed (TODO: confirm how it is measured).
     */
    public static final String A_DISTANCE_FROM_SEED = "distance-from-seed";

    /**
     * Attribute key <code>"html-base-href"</code>. Presumably the base
     * href declared by an HTML document (TODO: confirm).
     */
    public static final String A_HTML_BASE = "html-base-href";

    /**
     * Attribute key <code>"retry-delay"</code>. Presumably a delay to
     * observe before retrying a URI (TODO: confirm units).
     */
    public static final String A_RETRY_DELAY = "retry-delay";

    /**
     * Attribute key <code>"meta-robots"</code>. Presumably the content of
     * an HTML robots meta tag (TODO: confirm).
     */
    public static final String A_META_ROBOTS = "meta-robots";

    /**
     * Define for org.archive.crawler.writer.MirrorWriterProcessor.
     */
    public static final String A_MIRROR_PATH = "mirror-path";

    /**
     * Key to get credential avatars from A_LIST.
     */
    public static final String A_CREDENTIAL_AVATARS_KEY = "credential-avatars";

    /** A 'source' (usu. URI) that's inherited by discovered URIs. */
    public static final String A_SOURCE_TAG = "source";

    /**
     * Key to (optional) attribute specifying a list of keys that
     * are passed to CandidateURIs that 'descend' (are discovered
     * via) this URI.
     */
    public static final String A_HERITABLE_KEYS = "heritable";

    /** Flag indicating the containing queue should be retired. */
    public static final String A_FORCE_RETIRE = "force-retire";

    /** Local override of proxy host. */
    public static final String A_HTTP_PROXY_HOST = "http-proxy-host";

    /** Local override of proxy port. */
    public static final String A_HTTP_PROXY_PORT = "http-proxy-port";

    /* Fetch truncation codes present in CrawlURI annotations */

    /**
     * Suffix shared by all fetch truncation codes present in
     * {@link CrawlURI} annotations: every truncation annotation ends with
     * this suffix (TODO: Make for-sure unique or redo truncation so
     * definitive flag marked against {@link CrawlURI}).
     */
    public static final String TRUNC_SUFFIX = "Trunc";

    /** Truncation code <code>"headerTrunc"</code>. */
    public static final String HEADER_TRUNC = "header" + TRUNC_SUFFIX;

    /** Truncation code <code>"timeTrunc"</code>. */
    public static final String TIMER_TRUNC = "time" + TRUNC_SUFFIX;

    /** Truncation code <code>"lenTrunc"</code>. */
    public static final String LENGTH_TRUNC = "len" + TRUNC_SUFFIX;

    /* Duplication-reduction / recrawl / history constants */

    /** Fetch history array. */
    public static final String A_FETCH_HISTORY = "fetch-history";

    /** Content digest. */
    public static final String A_CONTENT_DIGEST = "content-digest";

    /** Header name (and AList key) for last-modified timestamp. */
    public static final String A_LAST_MODIFIED_HEADER = "last-modified";

    /** Header name (and AList key) for ETag. */
    public static final String A_ETAG_HEADER = "etag";

    /** Key for status (when in history). */
    public static final String A_STATUS = "status";

    /** Reference length (content length or virtual length). */
    public static final String A_REFERENCE_LENGTH = "reference-length";

}
|