001: /*
002: * ARCConstants
003: *
004: * $Id: ARCConstants.java 5029 2007-03-29 23:53:50Z gojomo $
005: *
006: * Created on Dec 30, 2003.
007: *
008: * Copyright (C) 2003 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.io.arc;
027:
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.zip.Deflater;
import java.util.zip.GZIPInputStream;

import org.archive.io.ArchiveFileConstants;
import org.archive.io.GzipHeader;
035:
036: /**
037: * Constants used by ARC files and in ARC file processing.
038: *
039: * @author stack
040: */
041: public interface ARCConstants extends ArchiveFileConstants {
042: /**
043: * Default maximum ARC file size.
044: */
045: public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000;
046:
047: /**
048: * Maximum length for a metadata line.
049: */
050: public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024);
051:
052: /**
053: * ARC file extention.
054: */
055: public static final String ARC_FILE_EXTENSION = "arc";
056:
057: /**
058: * Dot ARC file extension.
059: */
060: public static final String DOT_ARC_FILE_EXTENSION = "."
061: + ARC_FILE_EXTENSION;
062:
063: public static final String DOT_COMPRESSED_FILE_EXTENSION = ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
064:
065: /**
066: * Compressed arc file extension.
067: */
068: public static final String COMPRESSED_ARC_FILE_EXTENSION = ARC_FILE_EXTENSION
069: + DOT_COMPRESSED_FILE_EXTENSION;
070:
071: /**
072: * Compressed dot arc file extension.
073: */
074: public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION = DOT_ARC_FILE_EXTENSION
075: + DOT_COMPRESSED_FILE_EXTENSION;
076:
077: /**
078: * Encoding to use getting bytes from strings.
079: *
080: * Specify an encoding rather than leave it to chance: i.e whatever the
081: * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
082: */
083: public static final String DEFAULT_ENCODING = "ISO-8859-1";
084:
085: /**
086: * ARC file line seperator character.
087: *
088: * This is what the alexa c-code looks for delimiting lines.
089: */
090: public static final char LINE_SEPARATOR = '\n';
091:
092: /**
093: * ARC header field seperator character.
094: */
095: public static final char HEADER_FIELD_SEPARATOR = ' ';
096:
097: /**
098: * ARC file *MAGIC NUMBER*.
099: *
100: * Every ARC file must begin w/ this.
101: */
102: public static final String ARC_MAGIC_NUMBER = "filedesc://";
103:
104: /**
105: * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to
106: * understand FLG.FEXTRA).
107: */
108: public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X',
109: 4, 0, 0, 0, 0, 0 };
110:
111: /**
112: * Key for the ARC Header IP field.
113: *
114: * Lowercased.
115: */
116: public static final String IP_HEADER_FIELD_KEY = "ip-address";
117:
118: /**
119: * Key for the ARC Header Result Code field.
120: *
121: * Lowercased.
122: */
123: public static final String CODE_HEADER_FIELD_KEY = "result-code";
124:
125: /**
126: * Key for the ARC Header Checksum field.
127: *
128: * Lowercased.
129: */
130: public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum";
131:
132: /**
133: * Key for the ARC Header Location field.
134: *
135: * Lowercased.
136: */
137: public static final String LOCATION_HEADER_FIELD_KEY = "location";
138:
139: /**
140: * Key for the ARC Header Offset field.
141: *
142: * Lowercased.
143: */
144: public static final String OFFSET_HEADER_FIELD_KEY = "offset";
145:
146: /**
147: * Key for the ARC Header filename field.
148: *
149: * Lowercased.
150: */
151: public static final String FILENAME_HEADER_FIELD_KEY = "filename";
152:
153: /**
154: * Key for statuscode field.
155: */
156: public static final String STATUSCODE_FIELD_KEY = "statuscode";
157:
158: /**
159: * Key for offset field.
160: */
161: public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY;
162:
163: /**
164: * Key for filename field.
165: */
166: public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY;
167:
168: /**
169: * Key for checksum field.
170: */
171: public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY;
172:
173: /**
174: * Tokenized field prefix.
175: *
176: * Use this prefix for tokenized fields when naming fields in
177: * an index.
178: */
179: public static final String TOKENIZED_PREFIX = "tokenized_";
180:
181: /**
182: * Version 1 required metadata fields.
183: */
184: public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays
185: .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
186: DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
187: LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
188: ABSOLUTE_OFFSET_KEY });
189:
190: /**
191: * Minimum possible record length.
192: *
193: * This is a rough calc. When the header is data it will occupy less space.
194: */
195: public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1
196: + ARC_FILE_EXTENSION.length() + " ".length() + +1
197: + " ".length() + 1 + " ".length() + 1 + "/".length() + 1
198: + " ".length() + 1;
199:
200: /**
201: * Start of a GZIP header that uses default deflater.
202: */
203: public static final byte[] GZIP_HEADER_BEGIN = {
204: (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short)
205: (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short)
206: Deflater.DEFLATED // Compression method (CM)
207: };
208:
209: /**
210: * Length of minimual 'default GZIP header.
211: *
212: * See RFC1952 for explaination of value of 10.
213: */
214: public static final int DEFAULT_GZIP_HEADER_LENGTH = GzipHeader.MINIMAL_GZIP_HEADER_LENGTH;
215: }
|