001: /* $Id: WARCRecord.java 4566 2006-08-31 16:51:41Z stack-sf $
002: *
003: * Created on August 25th, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc;
024:
025: import it.unimi.dsi.fastutil.io.RepositionableStream;
026:
027: import java.io.IOException;
028: import java.io.InputStream;
029: import java.util.HashMap;
030: import java.util.Map;
031: import java.util.Set;
032: import java.util.regex.Matcher;
033: import java.util.regex.Pattern;
034:
035: import org.apache.commons.httpclient.Header;
036: import org.apache.commons.httpclient.HttpParser;
037: import org.archive.io.ArchiveRecord;
038: import org.archive.io.ArchiveRecordHeader;
039:
040: /**
041: * A WARC file Record.
042: *
043: * @author stack
044: */
045: public class WARCRecord extends ArchiveRecord implements WARCConstants {
046: private Pattern WHITESPACE = Pattern.compile("\\s");
047:
048: /**
049: * Constructor.
050: *
051: * @param in Stream cue'd up to be at the start of the record this instance
052: * is to represent.
053: * @throws IOException
054: */
055: public WARCRecord(InputStream in, final String identifier,
056: final long offset) throws IOException {
057: this (in, identifier, offset, true, false);
058: }
059:
060: /**
061: * Constructor.
062: * @param in Stream cue'd up just past Header Line and Named Fields.
063: * @param headers Header Line and ANVL Named fields.
064: * @throws IOException
065: */
066: public WARCRecord(InputStream in, ArchiveRecordHeader headers)
067: throws IOException {
068: super (in, headers, 0, true, false);
069: }
070:
071: /**
072: * Constructor.
073: *
074: * @param in Stream cue'd up to be at the start of the record this instance
075: * is to represent or, if <code>headers</code> is not null, just past the
076: * Header Line and Named Fields.
077: * @param identifier Identifier for this the hosting Reader.
078: * @param offset Current offset into <code>in</code> (Used to keep
079: * <code>position</code> properly aligned). Usually 0.
080: * @param digest True if we're to calculate digest for this record. Not
081: * digesting saves about ~15% of cpu during parse.
082: * @param strict Be strict parsing (Parsing stops if file inproperly
083: * formatted).
084: * @throws IOException
085: */
086: public WARCRecord(final InputStream in, final String identifier,
087: final long offset, boolean digest, boolean strict)
088: throws IOException {
089: super (in, null, 0, digest, strict);
090: setHeader(parseHeaders(in, identifier, offset, strict));
091: }
092:
093: /**
094: * Parse WARC Header Line and Named Fields.
095: * @param in Stream to read.
096: * @param identifier Identifier for the hosting Reader.
097: * @param offset Absolute offset into Reader.
098: * @param strict Whether to be loose parsing or not.
099: * @return An ArchiveRecordHeader.
100: * @throws IOException
101: */
102: protected ArchiveRecordHeader parseHeaders(final InputStream in,
103: final String identifier, final long offset,
104: final boolean strict) throws IOException {
105: final Map<Object, Object> m = new HashMap<Object, Object>();
106: m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
107: m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
108:
109: long startPosition = -1;
110: if (in instanceof RepositionableStream) {
111: startPosition = ((RepositionableStream) in).position();
112: }
113: String firstLine = new String(HttpParser.readLine(in,
114: WARC_HEADER_ENCODING));
115: if (firstLine == null || firstLine.length() <= 0) {
116: throw new IOException("Failed to read WARC_MAGIC");
117: }
118: if (!firstLine.startsWith(WARC_MAGIC)) {
119: throw new IOException("Failed to find WARC MAGIC: "
120: + firstLine);
121: }
122: // Here we start reading off the inputstream but we're reading the
123: // stream direct rather than going via WARCRecord#read. The latter will
124: // keep count of bytes read, digest and fail properly if EOR too soon...
125: // We don't want digesting while reading Headers.
126: //
127: Header[] h = HttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
128: for (int i = 0; i < h.length; i++) {
129: m.put(h[i].getName(), h[i].getValue());
130: }
131: int headerLength = -1;
132: if (in instanceof RepositionableStream) {
133: headerLength = (int) (((RepositionableStream) in)
134: .position() - startPosition);
135: }
136: final int contentOffset = headerLength;
137: incrementPosition(contentOffset);
138:
139: return new ArchiveRecordHeader() {
140: private Map<Object, Object> headers = m;
141: private int contentBegin = contentOffset;
142:
143: public String getDate() {
144: return (String) this .headers.get(HEADER_KEY_DATE);
145: }
146:
147: public String getDigest() {
148: return (String) this .headers.get(HEADER_KEY_CHECKSUM);
149: }
150:
151: public String getReaderIdentifier() {
152: return (String) this .headers
153: .get(READER_IDENTIFIER_FIELD_KEY);
154: }
155:
156: public Set getHeaderFieldKeys() {
157: return this .headers.keySet();
158: }
159:
160: public Map getHeaderFields() {
161: return this .headers;
162: }
163:
164: public Object getHeaderValue(String key) {
165: return this .headers.get(key);
166: }
167:
168: public long getLength() {
169: Object o = this .headers.get(CONTENT_LENGTH);
170: if (o == null) {
171: return -1;
172: }
173: long contentLength = (o instanceof Long) ? ((Long) o)
174: .longValue() : Long.parseLong((String) o);
175: return contentLength + contentOffset;
176: }
177:
178: public String getMimetype() {
179: return (String) this .headers.get(CONTENT_TYPE);
180: }
181:
182: public long getOffset() {
183: Object o = this .headers.get(ABSOLUTE_OFFSET_KEY);
184: if (o == null) {
185: return -1;
186: }
187: return (o instanceof Long) ? ((Long) o).longValue()
188: : Long.parseLong((String) o);
189: }
190:
191: public String getRecordIdentifier() {
192: return (String) this .headers
193: .get(RECORD_IDENTIFIER_FIELD_KEY);
194: }
195:
196: public String getUrl() {
197: return (String) this .headers.get(HEADER_KEY_URI);
198: }
199:
200: public String getVersion() {
201: return (String) this .headers.get(VERSION_FIELD_KEY);
202: }
203:
204: public int getContentBegin() {
205: return this .contentBegin;
206: }
207:
208: @Override
209: public String toString() {
210: return this .headers.toString();
211: }
212: };
213: }
214:
215: @Override
216: protected String getMimetype4Cdx(ArchiveRecordHeader h) {
217: final String m = super .getMimetype4Cdx(h);
218: // Mimetypes can have spaces in WARCs. Emitting for CDX, just
219: // squash them for now. Later, quote them since squashing spaces won't
220: // work for params that have quoted-string values.
221: Matcher matcher = WHITESPACE.matcher(m);
222: return matcher.replaceAll("");
223: }
224: }
|