001: /*
002: * WARCConstants
003: *
004: * $Id: WARCConstants.java 4976 2007-03-09 13:59:07Z gojomo $
005: *
006: * Created on July 27th, 2006
007: *
008: * Copyright (C) 2006 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.io.warc;
027:
028: import java.util.Arrays;
029: import java.util.List;
030:
031: import org.archive.io.ArchiveFileConstants;
032:
033: /**
034: * WARC Constants used by WARC readers and writers.
035: * Below constants are used by version 0.10 and 0.12 of WARC Reader/Writer.
036: * @author stack
037: * @version $Revision: 4976 $ $Date: 2007-03-09 13:59:07 +0000 (Fri, 09 Mar 2007) $
038: */
039: public interface WARCConstants extends ArchiveFileConstants {
040: /**
041: * Default maximum WARC file size.
042: * 1Gig.
043: */
044: public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
045:
046: /**
047: * WARC MAGIC
048: * WARC files and records begin with this sequence.
049: */
050: public static final String WARC_MAGIC = "WARC/";
051: public static final String WARC_010_MAGIC = "WARC/";
052:
053: /**
054: * Hard-coded version for WARC files made with this code.
055: * Setting to 0.10 because differs from 0.9 spec. See accompanying
056: * package documentation.
057: */
058: public static final String WARC_VERSION = "0.12";
059:
060: /**
061: * Assumed maximum size of a Header Line.
062: *
063: * This 100k which seems massive but its the same as the LINE_LENGTH from
064: * <code>alexa/include/a_arcio.h</code>:
065: * <pre>
066: * #define LINE_LENGTH (100*1024)
067: * </pre>
068: */
069: public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
070: public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
071:
072: /**
073: * WARC file extention.
074: */
075: public static final String WARC_FILE_EXTENSION = "warc";
076:
077: /**
078: * Dot WARC file extension.
079: */
080: public static final String DOT_WARC_FILE_EXTENSION = "."
081: + WARC_FILE_EXTENSION;
082:
083: public static final String DOT_COMPRESSED_FILE_EXTENSION = ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
084:
085: /**
086: * Compressed WARC file extension.
087: */
088: public static final String COMPRESSED_WARC_FILE_EXTENSION = WARC_FILE_EXTENSION
089: + DOT_COMPRESSED_FILE_EXTENSION;
090:
091: /**
092: * Compressed dot WARC file extension.
093: */
094: public static final String DOT_COMPRESSED_WARC_FILE_EXTENSION = DOT_WARC_FILE_EXTENSION
095: + DOT_COMPRESSED_FILE_EXTENSION;
096:
097: /**
098: * Encoding to use getting bytes from strings.
099: *
100: * Specify an encoding rather than leave it to chance: i.e whatever the
101: * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
102: *
103: * <p>TODO: ARC uses ISO-8859-1. In general, we should use UTF-8 but we
104: * probably need a single byte encoding if we're out for preserving the
105: * binary data as received over the net (We probably don't want to transform
106: * the supra-ASCII characters to UTF-8 before storing in ARC). For now,
107: * till we figure it, DEFAULT_ENCODING is single-byte charset -- same as
108: * ARCs.
109: */
110: public static final String DEFAULT_ENCODING = "ISO-8859-1";
111: public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING;
112:
113: // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822
114: // headers
115: public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
116:
117: public static final String[] HEADER_FIELD_KEYS = {
118: VERSION_FIELD_KEY, LENGTH_FIELD_KEY, TYPE_FIELD_KEY,
119: URL_FIELD_KEY, DATE_FIELD_KEY, RECORD_IDENTIFIER_FIELD_KEY,
120: MIMETYPE_FIELD_KEY };
121:
122: /**
123: * WARC Record Types.
124: */
125: public static final String WARCINFO = "warcinfo";
126: public static final String RESPONSE = "response";
127: public static final String RESOURCE = "resource";
128: public static final String REQUEST = "request";
129: public static final String METADATA = "metadata";
130: public static final String REVISIT = "revisit";
131: public static final String CONVERSION = "conversion";
132: public static final String CONTINUATION = "continuation";
133:
134: public static final String TYPE = "type";
135:
136: // List of all WARC Record TYPES
137: public static final String[] TYPES = { WARCINFO, RESPONSE,
138: RESOURCE, REQUEST, METADATA, REVISIT, CONVERSION,
139: CONTINUATION };
140:
141: // Indices into TYPES array.
142: public static final int WARCINFO_INDEX = 0;
143: public static final int RESPONSE_INDEX = 1;
144: public static final int RESOURCE_INDEX = 2;
145: public static final int REQUEST_INDEX = 3;
146: public static final int METADATA_INDEX = 4;
147: public static final int REVISIT_INDEX = 5;
148: public static final int CONVERSION_INDEX = 6;
149: public static final int CONTINUATION_INDEX = 7;
150:
151: // TYPES as List.
152: public static final List TYPES_LIST = Arrays.asList(TYPES);
153:
154: /**
155: * WARC-ID
156: */
157: public static final String WARC_ID = WARC_MAGIC + WARC_VERSION;
158: public static final String WARC_010_ID = WARC_010_MAGIC + "0.10";
159:
160: /**
161: * Header field seperator character.
162: */
163: public static final char HEADER_FIELD_SEPARATOR = ' ';
164:
165: /**
166: * WSP
167: * One of a space or horizontal tab character.
168: * TODO: WSP undefined. Fix.
169: */
170: public static final Character[] WSP = { HEADER_FIELD_SEPARATOR,
171: '\t' };
172:
173: /**
174: * Placeholder for length in Header line.
175: * Placeholder is same size as the fixed field size allocated for length,
176: * 12 characters. 12 characters allows records of size almost 1TB.
177: */
178: public static final String PLACEHOLDER_RECORD_LENGTH_STRING = "000000000000";
179:
180: public static final String NAMED_FIELD_IP_LABEL = "IP-Address";
181: public static final String NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
182: public static final String NAMED_FIELD_RELATED_LABEL = "References";
183: public static final String NAMED_FIELD_WARCFILENAME = "Filename";
184: public static final String NAMED_FIELD_DESCRIPTION = "Description";
185: public static final String NAMED_FIELD_FILEDESC = "ARC-FileDesc";
186: public static final String NAMED_FIELD_TRUNCATED = "Truncated";
187: public static final String NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
188: public static final String NAMED_FIELD_TRUNCATED_VALUE_LEN = "length";
189: public static final String NAMED_FIELD_TRUNCATED_VALUE_HEAD = "long-headers";
190: public static final String NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
191:
192: // Headers new to version 0.12 of spec.
193: public static final String HEADER_KEY_URI = "WARC-Subject-URI";
194: public static final String HEADER_KEY_DATE = "WARC-Date";
195: public static final String HEADER_KEY_TYPE = "WARC-Type";
196: public static final String HEADER_KEY_IP = "WARC-IP-Address";
197: public static final String HEADER_KEY_CHECKSUM = "WARC-Content-Digest";
198: public static final String HEADER_KEY_CONCURRENT_TO = "WARC-Concurrent-To";
199: public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
200: public static final String HEADER_KEY_PROFILE = "WARC-Profile";
201: public static final String HEADER_KEY_FILENAME = "WARC-Filename";
202: public static final String HEADER_KEY_ETAG = "WARC-Etag";
203: public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
204:
205: public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/0.12/revisit/identical-content-digest";
206: public static final String PROFILE_REVISIT_NOT_MODIFIED = "http://netpreserve.org/warc/0.12/revisit/server-not-modified";
207: public static final String PROFILE_CONVERSION_SOFTWARE_COMMAND = "http://netpreserve.org/warc/0.12/conversion/software-command";
208:
209: public static final String MIME_VERSION = "MIME-Version: 1.0";
210: public static final String CONTENT_ID = "Content-ID";
211: public static final String CONTENT_LENGTH = "Content-Length";
212: public static final String CONTENT_TYPE = "Content-Type";
213: public static final String CONTENT_DESCRIPTION = "Content-Description";
214:
215: public static final String COLON_SPACE = ": ";
216: // TODO: This is not in spec. Fix.
217: public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
218:
219: /**
220: * To be safe, lets use application type rather than message. Regards
221: * 'message/http', RFC says "...provided that it obeys the MIME restrictions
222: * for all 'message' types regarding line length and encodings." This
223: * usually means lines of 1000 octets max (unless a
224: * 'Content-Transfer-Encoding: binary' mime header is present).
225: * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
226: */
227: public static final String HTTP_REQUEST_MIMETYPE = "application/http; msgtype=request";
228: public static final String HTTP_RESPONSE_MIMETYPE = "application/http; msgtype=response";
229: }
|