001: /* ARCRecordMetaData
002: *
003: * $Id: ARCRecordMetaData.java 4547 2006-08-28 23:44:20Z stack-sf $
004: *
005: * Created on Jan 7, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.io.arc;
026:
027: import java.io.File;
028: import java.io.IOException;
029: import java.util.Iterator;
030: import java.util.Map;
031: import java.util.Set;
032:
033: import org.archive.io.ArchiveRecordHeader;
034:
035: /**
036: * Holds the meta data of a single ARC record (not immutable: digest, status code and content-begin offset have setters).
037: *
038: * @author stack
039: */
040: public class ARCRecordMetaData implements ArchiveRecordHeader,
041: ARCConstants {
042: /**
043: * Map of record header fields.
044: *
045: * We store all in a hashmap. This way we can hold version 1 or
046: * version 2 record meta data.
047: *
048: * <p>Keys are lowercase.
049: */
050: protected Map headerFields = null;
051:
052: /**
053: * Digest for the record.
054: *
055: * Only available after the record has been read in totality.
056: */
057: private String digest = null;
058:
059: /**
060: * Status for this request.
061: *
062: * There may be no status.
063: */
064: private String statusCode = null;
065:
066: /**
067: * The arc this metadata came out.
068: * Descriptive String, either path or URL.
069: */
070: private String arc = null;
071:
072: private int contentBegin = 0;
073:
074: /**
075: * Shut down the default constructor.
076: */
077: protected ARCRecordMetaData() {
078: super ();
079: }
080:
081: /**
082: * Constructor.
083: *
084: * @param arc The arc file this metadata came out of.
085: * @param headerFields Hash of meta fields.
086: *
087: * @throws IOException
088: */
089: public ARCRecordMetaData(final String arc, Map headerFields)
090: throws IOException {
091: // Make sure the minimum required fields are present,
092: for (Iterator i = REQUIRED_VERSION_1_HEADER_FIELDS.iterator(); i
093: .hasNext();) {
094: testRequiredField(headerFields, (String) i.next());
095: }
096: this .headerFields = headerFields;
097: this .arc = arc;
098: }
099:
100: /**
101: * Test required field is present in hash.
102: *
103: * @param fields Map of fields.
104: * @param requiredField Field to test for.
105: *
106: * @exception IOException If required field is not present.
107: */
108: protected void testRequiredField(Map fields, String requiredField)
109: throws IOException {
110: if (!fields.containsKey(requiredField)) {
111: throw new IOException("Required field " + requiredField
112: + " not in meta data.");
113: }
114: }
115:
116: /**
117: * Get the time when the record was harvested.
118: * <p>
119: * Returns the date in Heritrix 14 digit time format (UTC). See the
120: * {@link org.archive.util.ArchiveUtils} class for converting to Java
121: * dates.
122: *
123: * @return Header date in Heritrix 14 digit format.
124: * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
125: */
126: public String getDate() {
127: return (String) this .headerFields.get(DATE_FIELD_KEY);
128: }
129:
130: /**
131: * @return Return length of the record.
132: */
133: public long getLength() {
134: return Long.parseLong((String) this .headerFields
135: .get(LENGTH_FIELD_KEY));
136: }
137:
138: /**
139: * @return Header url.
140: */
141: public String getUrl() {
142: return (String) this .headerFields.get(URL_FIELD_KEY);
143: }
144:
145: /**
146: * @return IP.
147: */
148: public String getIp() {
149: return (String) this .headerFields.get(IP_HEADER_FIELD_KEY);
150: }
151:
152: /**
153: * @return mimetype The mimetype that is in the ARC metaline -- NOT the http
154: * content-type content.
155: */
156: public String getMimetype() {
157: return (String) this .headerFields.get(MIMETYPE_FIELD_KEY);
158: }
159:
160: /**
161: * @return Arcfile version.
162: */
163: public String getVersion() {
164: return (String) this .headerFields.get(VERSION_FIELD_KEY);
165: }
166:
167: /**
168: * @return Offset into arcfile at which this record begins.
169: */
170: public long getOffset() {
171: return ((Long) this .headerFields.get(ABSOLUTE_OFFSET_KEY))
172: .longValue();
173: }
174:
175: /**
176: * @param key Key to use looking up field value.
177: * @return value for passed key of null if no such entry.
178: */
179: public Object getHeaderValue(String key) {
180: return this .headerFields.get(key);
181: }
182:
183: /**
184: * @return Header field name keys.
185: */
186: public Set getHeaderFieldKeys() {
187: return this .headerFields.keySet();
188: }
189:
190: /**
191: * @return Map of header fields.
192: */
193: public Map getHeaderFields() {
194: return this .headerFields;
195: }
196:
197: /**
198: * @return Returns identifier for ARC.
199: */
200: public String getArc() {
201: return this .arc;
202: }
203:
204: /**
205: * @return Convenience method that does a
206: * return new File(this.arc) (Be aware this.arc is not always
207: * full path to an ARC file -- may be an URL). Test
208: * returned file for existence.
209: */
210: public File getArcFile() {
211: return new File(this .arc);
212: }
213:
214: /**
215: * @return Returns the digest.
216: */
217: public String getDigest() {
218: return this .digest;
219: }
220:
221: /**
222: * @param d The digest to set.
223: */
224: public void setDigest(String d) {
225: this .digest = d;
226: }
227:
228: /**
229: * @return Returns the statusCode. May be null.
230: */
231: public String getStatusCode() {
232: return this .statusCode;
233: }
234:
235: /**
236: * @param statusCode The statusCode to set.
237: */
238: public void setStatusCode(String statusCode) {
239: this .statusCode = statusCode;
240: }
241:
242: public String toString() {
243: return ((this .arc != null) ? this .arc : "")
244: + ": "
245: + ((this .headerFields != null) ? this .headerFields
246: .toString() : "");
247: }
248:
249: public String getReaderIdentifier() {
250: return this .getArc();
251: }
252:
253: public String getRecordIdentifier() {
254: return getDate() + "/" + getUrl();
255: }
256:
257: public int getContentBegin() {
258: return this .contentBegin;
259: }
260:
261: void setContentBegin(final int offset) {
262: this.contentBegin = offset;
263: }
264: }
|