001: /* $Id: WARCRecord.java 4566 2006-08-31 16:51:41Z stack-sf $
002: *
003: * Created on August 25th, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc.v10;
024:
025: import java.io.ByteArrayOutputStream;
026: import java.io.IOException;
027: import java.io.InputStream;
028: import java.util.HashMap;
029: import java.util.Map;
030: import java.util.Set;
031: import java.util.regex.Matcher;
032: import java.util.regex.Pattern;
033:
034: import org.archive.io.ArchiveRecord;
035: import org.archive.io.ArchiveRecordHeader;
036: import org.archive.io.warc.WARCConstants;
037: import org.archive.util.LongWrapper;
038: import org.archive.util.anvl.ANVLRecord;
039:
040: /**
041: * A WARC file Record.
042: *
043: * @author stack
044: */
045: public class WARCRecord extends ArchiveRecord implements WARCConstants {
046: /**
047: * Header-Line pattern;
048: * I heart http://www.fileformat.info/tool/regex.htm
049: */
050: private final static Pattern HEADER_LINE = Pattern
051: .compile("^WARC/([0-9]+\\.[0-9]+(?:\\.[0-9]+)?)"
052: + // Regex group 1: WARC lead-in.
053: "[\\t ]+"
054: + // Multiple tabs or spaces.
055: "([0-9]+)"
056: + // Regex group 2: Length.
057: "[\\t ]+"
058: + // Multiple tabs or spaces.
059: "(request|response|warcinfo|resource|metadata|"
060: + "revisit|conversion)" + // Regex group 3: Type of WARC Record.
061: "[\\t ]+" + // Multiple tabs or spaces.
062: "([^\\t ]+)" + // Regex group 4: Subject-uri.
063: "[\\t ]+" + // Multiple tabs or spaces.
064: "([0-9]{14})" + // Regex group 5: Date
065: "[\\t ]+" + // Multiple tabs or spaces.
066: "([^\\t ]+)" + // Regex group 6: Record-Id
067: "[\\t ]+" + // Multiple tabs or spaces.
068: "(.+)$"); // Regex group 7: Mimetype.
069:
070: private Pattern WHITESPACE = Pattern.compile("\\s");
071:
072: /**
073: * Constructor.
074: *
075: * @param in Stream cue'd up to be at the start of the record this instance
076: * is to represent.
077: * @throws IOException
078: */
079: public WARCRecord(InputStream in, final String identifier,
080: final long offset) throws IOException {
081: this (in, identifier, offset, true, false);
082: }
083:
084: /**
085: * Constructor.
086: * @param in Stream cue'd up just past Header Line and Named Fields.
087: * @param headers Header Line and ANVL Named fields.
088: * @throws IOException
089: */
090: public WARCRecord(InputStream in, ArchiveRecordHeader headers)
091: throws IOException {
092: super (in, headers, 0, true, false);
093: }
094:
095: /**
096: * Constructor.
097: *
098: * @param in Stream cue'd up to be at the start of the record this instance
099: * is to represent or, if <code>headers</code> is not null, just past the
100: * Header Line and Named Fields.
101: * @param identifier Identifier for this the hosting Reader.
102: * @param offset Current offset into <code>in</code> (Used to keep
103: * <code>position</code> properly aligned). Usually 0.
104: * @param digest True if we're to calculate digest for this record. Not
105: * digesting saves about ~15% of cpu during parse.
106: * @param strict Be strict parsing (Parsing stops if file inproperly
107: * formatted).
108: * @throws IOException
109: */
110: public WARCRecord(final InputStream in, final String identifier,
111: final long offset, boolean digest, boolean strict)
112: throws IOException {
113: super (in, null, 0, digest, strict);
114: setHeader(parseHeaders(in, identifier, offset, strict));
115: }
116:
117: /**
118: * Parse WARC Header Line and Named Fields.
119: * @param in Stream to read.
120: * @param identifier Identifier for the hosting Reader.
121: * @param offset Absolute offset into Reader.
122: * @param strict Whether to be loose parsing or not.
123: * @return An ArchiveRecordHeader.
124: * @throws IOException
125: */
126: protected ArchiveRecordHeader parseHeaders(final InputStream in,
127: final String identifier, final long offset,
128: final boolean strict) throws IOException {
129: final Map<Object, Object> m = new HashMap<Object, Object>();
130: m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
131: m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
132: // Here we start reading off the inputstream but we're reading the
133: // stream direct rather than going via WARCRecord#read. The latter will
134: // keep count of bytes read, digest and fail properly if EOR too soon...
135: // We don't want digesting while reading Header Line and Named Fields.
136: //
137: // The returned length includes terminating CRLF.
138: int headLineLength = parseHeaderLine(in, m, strict);
139:
140: // Now, doing the ANVL parse, hard to know how many bytes have been
141: // read since passed Stream doesn't keep count and the ANVL parse can
142: // throw away bytes (e.g. if white space padding at start of a folded
143: // Value or if a Value has a newline in it and it gets converted to a
144: // CRNL in the ANVL representation). Wrap the stream in a
145: // byte-counting stream.
146: //
147: // TODO: Buffering. Currently, we rely on the deflate buffer when
148: // file is gzipped. Otherwise, if uncompressed, no buffering.
149: final LongWrapper anvlParseLength = new LongWrapper(0);
150: InputStream countingStream = new InputStream() {
151: @Override
152: public int read() throws IOException {
153: int c = in.read();
154: if (c != -1) {
155: anvlParseLength.longValue++;
156: }
157: return c;
158: }
159: };
160: parseNamedFields(countingStream, m);
161: // Set offset at which content begins. Its the Header Line length plus
162: // whatever we read parsing ANVL.
163: final int contentOffset = (int) (headLineLength + anvlParseLength.longValue);
164: incrementPosition(contentOffset);
165:
166: return new ArchiveRecordHeader() {
167: private Map<Object, Object> fields = m;
168: private int contentBegin = contentOffset;
169:
170: public String getDate() {
171: return (String) this .fields.get(DATE_FIELD_KEY);
172: }
173:
174: public String getDigest() {
175: return (String) this .fields
176: .get(NAMED_FIELD_CHECKSUM_LABEL);
177: }
178:
179: public String getReaderIdentifier() {
180: return (String) this .fields
181: .get(READER_IDENTIFIER_FIELD_KEY);
182: }
183:
184: public Set getHeaderFieldKeys() {
185: return this .fields.keySet();
186: }
187:
188: public Map getHeaderFields() {
189: return this .fields;
190: }
191:
192: public Object getHeaderValue(String key) {
193: return this .fields.get(key);
194: }
195:
196: public long getLength() {
197: Object o = this .fields.get(LENGTH_FIELD_KEY);
198: if (o == null) {
199: return -1;
200: }
201: return ((Long) o).longValue();
202: }
203:
204: public String getMimetype() {
205: return (String) this .fields.get(MIMETYPE_FIELD_KEY);
206: }
207:
208: public long getOffset() {
209: Object o = this .fields.get(ABSOLUTE_OFFSET_KEY);
210: if (o == null) {
211: return -1;
212: }
213: return ((Long) o).longValue();
214: }
215:
216: public String getRecordIdentifier() {
217: return (String) this .fields
218: .get(RECORD_IDENTIFIER_FIELD_KEY);
219: }
220:
221: public String getUrl() {
222: return (String) this .fields.get(URL_FIELD_KEY);
223: }
224:
225: public String getVersion() {
226: return (String) this .fields.get(VERSION_FIELD_KEY);
227: }
228:
229: public int getContentBegin() {
230: return this .contentBegin;
231: }
232:
233: @Override
234: public String toString() {
235: return this .fields.toString();
236: }
237: };
238: }
239:
240: protected int parseHeaderLine(final InputStream in,
241: final Map<Object, Object> fields, final boolean strict)
242: throws IOException {
243: byte[] line = readLine(in, strict);
244: if (line.length <= 2) {
245: throw new IOException("No Header Line found");
246: }
247: // Strip the CRLF.
248: String headerLine = new String(line, 0, line.length - 2,
249: HEADER_LINE_ENCODING);
250: Matcher m = HEADER_LINE.matcher(headerLine);
251: if (!m.matches()) {
252: throw new IOException("Failed parse of Header Line: "
253: + headerLine);
254: }
255: for (int i = 0; i < HEADER_FIELD_KEYS.length; i++) {
256: if (i == 1) {
257: // Do length of Record as a Long.
258: fields.put(HEADER_FIELD_KEYS[i], Long.parseLong(m
259: .group(i + 1)));
260: continue;
261: }
262: fields.put(HEADER_FIELD_KEYS[i], m.group(i + 1));
263: }
264:
265: return line.length;
266: }
267:
268: /**
269: * Read a line.
270: * A 'line' in this context ends in CRLF and contains ascii-only and no
271: * control-characters.
272: * @param in InputStream to read.
273: * @param strict Strict parsing (If false, we'll eat whitespace before the
274: * record.
275: * @return All bytes in line including terminating CRLF.
276: * @throws IOException
277: */
278: protected byte[] readLine(final InputStream in, final boolean strict)
279: throws IOException {
280: boolean done = false;
281: boolean recordStart = strict;
282: int read = 0;
283: ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 /*SWAG*/);
284: for (int c = -1, previousCharacter; !done;) {
285: if (read++ >= MAX_LINE_LENGTH) {
286: throw new IOException("Read " + MAX_LINE_LENGTH
287: + " bytes without finding CRLF");
288: }
289: previousCharacter = c;
290: c = in.read();
291: if (c == -1) {
292: throw new IOException("End-Of-Stream before CRLF:\n"
293: + new String(baos.toByteArray()));
294: }
295: if (isLF((char) c) && isCR((char) previousCharacter)) {
296: done = true;
297: } else if (!recordStart && Character.isWhitespace(c)) {
298: // Skip any whitespace at start.
299: continue;
300: } else {
301: if (isCR((char) previousCharacter)) {
302: // If previous character was a CR and this character is not
303: // a LF, we tested above, thats illegal.
304: throw new IOException("CR in middle of Header:\n"
305: + new String(baos.toByteArray()));
306: }
307:
308: // Not whitespace so start record if we haven't already.
309: if (!recordStart) {
310: recordStart = true;
311: }
312: }
313: baos.write(c);
314: }
315: return baos.toByteArray();
316: }
317:
318: protected void parseNamedFields(final InputStream in,
319: final Map<Object, Object> fields) throws IOException {
320: ANVLRecord r = ANVLRecord.load(in);
321: fields.putAll(r.asMap());
322: }
323:
324: public static boolean isCROrLF(final char c) {
325: return isCR(c) || isLF(c);
326: }
327:
328: public static boolean isCR(final char c) {
329: return c == CRLF.charAt(0);
330: }
331:
332: public static boolean isLF(final char c) {
333: return c == CRLF.charAt(1);
334: }
335:
336: @Override
337: protected String getMimetype4Cdx(ArchiveRecordHeader h) {
338: final String m = super .getMimetype4Cdx(h);
339: // Mimetypes can have spaces in WARCs. Emitting for CDX, just
340: // squash them for now. Later, quote them since squashing spaces won't
341: // work for params that have quoted-string values.
342: Matcher matcher = WHITESPACE.matcher(m);
343: return matcher.replaceAll("");
344: }
345: }
|