001: /* $Id: ArchiveRecord.java 4646 2006-09-22 17:23:04Z paul_jack $
002: *
003: * Created on August 21st, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import java.io.IOException;
026: import java.io.InputStream;
027: import java.io.OutputStream;
028: import java.security.MessageDigest;
029: import java.security.NoSuchAlgorithmException;
030: import java.util.logging.Level;
031:
032: import org.archive.util.Base32;
033:
034: /**
035: * Archive file Record.
036: * @author stack
037: * @version $Date: 2006-09-22 17:23:04 +0000 (Fri, 22 Sep 2006) $ $Version$
038: */
039: public abstract class ArchiveRecord extends InputStream {
040: ArchiveRecordHeader header = null;
041:
042: /**
043: * Stream to read this record from.
044: *
045: * Stream can only be read sequentially. Will only return this records'
046: * content returning a -1 if you try to read beyond the end of the current
047: * record.
048: *
049: * <p>Streams can be markable or not. If they are, we'll be able to roll
050: * back when we've read too far. If not markable, assumption is that
051: * the underlying stream is managing our not reading too much (This pertains
052: * to the skipping over the end of the ARCRecord. See {@link #skip()}.
053: */
054: InputStream in = null;
055:
056: /**
057: * Position w/i the Record content, within <code>in</code>.
058: * This position is relative within this Record. Its not same as the
059: * Archive file position.
060: */
061: long position = 0;
062:
063: /**
064: * Set flag when we've reached the end-of-record.
065: */
066: boolean eor = false;
067:
068: /**
069: * Compute digest on what we read and add to metadata when done.
070: *
071: * Currently hardcoded as sha-1. TODO: Remove when archive records
072: * digest or else, add a facility that allows the arc reader to
073: * compare the calculated digest to that which is recorded in
074: * the arc.
075: *
076: * <p>Protected instead of private so subclasses can update and complete
077: * the digest.
078: */
079: protected MessageDigest digest = null;
080: private String digestStr = null;
081:
082: boolean strict = false;
083:
084: private ArchiveRecord() {
085: super ();
086: }
087:
088: /**
089: * Constructor.
090: *
091: * @param in Stream cue'd up to be at the start of the record this instance
092: * is to represent.
093: * @throws IOException
094: */
095: public ArchiveRecord(InputStream in) throws IOException {
096: this (in, null, 0, true, false);
097: }
098:
099: /**
100: * Constructor.
101: *
102: * @param in Stream cue'd up to be at the start of the record this instance
103: * is to represent.
104: * @param header Header data.
105: * @throws IOException
106: */
107: public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
108: throws IOException {
109: this (in, header, 0, true, false);
110: }
111:
112: /**
113: * Constructor.
114: *
115: * @param in Stream cue'd up to be at the start of the record this instance
116: * is to represent.
117: * @param header Header data.
118: * @param bodyOffset Offset into the body. Usually 0.
119: * @param digest True if we're to calculate digest for this record. Not
120: * digesting saves about ~15% of cpu during an ARC parse.
121: * @param strict Be strict parsing (Parsing stops if ARC inproperly
122: * formatted).
123: * @throws IOException
124: */
125: public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
126: int bodyOffset, boolean digest, boolean strict)
127: throws IOException {
128: this .in = in;
129: this .header = header;
130: this .position = bodyOffset;
131: if (digest) {
132: try {
133: this .digest = MessageDigest.getInstance("SHA1");
134: } catch (NoSuchAlgorithmException e) {
135: // Convert to IOE because thats more amenable to callers
136: // -- they are dealing with it anyways.
137: throw new IOException(e.getMessage());
138: }
139: }
140: this .strict = strict;
141: }
142:
143: public boolean markSupported() {
144: return false;
145: }
146:
147: /**
148: * @return Header data for this record.
149: */
150: public ArchiveRecordHeader getHeader() {
151: return this .header;
152: }
153:
154: protected void setHeader(ArchiveRecordHeader header) {
155: this .header = header;
156: }
157:
158: /**
159: * Calling close on a record skips us past this record to the next record
160: * in the stream.
161: *
162: * It does not actually close the stream. The underlying steam is probably
163: * being used by the next arc record.
164: *
165: * @throws IOException
166: */
167: public void close() throws IOException {
168: if (this .in != null) {
169: skip();
170: this .in = null;
171: if (this .digest != null) {
172: this .digestStr = Base32.encode(this .digest.digest());
173: }
174: }
175: }
176:
177: /**
178: * @return Next character in this Record content else -1 if at EOR.
179: * @throws IOException
180: */
181: public int read() throws IOException {
182: int c = -1;
183: if (available() > 0) {
184: c = this .in.read();
185: if (c == -1) {
186: throw new IOException(
187: "Premature EOF before end-of-record.");
188: }
189: if (this .digest != null) {
190: this .digest.update((byte) c);
191: }
192: }
193: incrementPosition();
194: return c;
195: }
196:
197: public int read(byte[] b, int offset, int length)
198: throws IOException {
199: int read = Math.min(length, available());
200: if (read == -1 || read == 0) {
201: read = -1;
202: } else {
203: read = this .in.read(b, offset, read);
204: if (read == -1) {
205: String msg = "Premature EOF before end-of-record: "
206: + getHeader().getHeaderFields();
207: if (isStrict()) {
208: throw new IOException(msg);
209: }
210: setEor(true);
211: System.err
212: .println(Level.WARNING.toString() + " " + msg);
213: }
214: if (this .digest != null && read >= 0) {
215: this .digest.update(b, offset, read);
216: }
217: }
218: incrementPosition(read);
219: return read;
220: }
221:
222: /**
223: * This available is not the stream's available. Its an available based on
224: * what the stated Archive record length is minus what we've read to date.
225: *
226: * @return True if bytes remaining in record content.
227: */
228: public int available() {
229: return (int) (getHeader().getLength() - getPosition());
230: }
231:
232: /**
233: * Skip over this records content.
234: *
235: * @throws IOException
236: */
237: void skip() throws IOException {
238: if (this .eor) {
239: return;
240: }
241:
242: // Read to the end of the body of the record. Exhaust the stream.
243: // Can't skip direct to end because underlying stream may be compressed
244: // and we're calculating the digest for the record.
245: if (available() > 0) {
246: skip(available());
247: }
248: }
249:
250: public long skip(long n) throws IOException {
251: final int SKIP_BUFFERSIZE = 1024 * 4;
252: byte[] b = new byte[SKIP_BUFFERSIZE];
253: long total = 0;
254: for (int read = 0; (total < n) && (read != -1);) {
255: read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
256: // TODO: Interesting is that reading from compressed stream, we only
257: // read about 500 characters at a time though we ask for 4k.
258: // Look at this sometime.
259: read = read(b, 0, read);
260: if (read <= 0) {
261: read = -1;
262: } else {
263: total += read;
264: }
265: }
266: return total;
267: }
268:
269: /**
270: * @return Returns the strict.
271: */
272: public boolean isStrict() {
273: return this .strict;
274: }
275:
276: /**
277: * @param strict The strict to set.
278: */
279: public void setStrict(boolean strict) {
280: this .strict = strict;
281: }
282:
283: protected InputStream getIn() {
284: return this .in;
285: }
286:
287: public String getDigestStr() {
288: return this .digestStr;
289: }
290:
291: protected void incrementPosition() {
292: this .position++;
293: }
294:
295: protected void incrementPosition(final long incr) {
296: this .position += incr;
297: }
298:
299: protected long getPosition() {
300: return this .position;
301: }
302:
303: protected boolean isEor() {
304: return eor;
305: }
306:
307: protected void setEor(boolean eor) {
308: this .eor = eor;
309: }
310:
311: protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
312: return "-";
313: }
314:
315: protected String getIp4Cdx(final ArchiveRecordHeader h) {
316: return "-";
317: }
318:
319: protected String getDigest4Cdx(final ArchiveRecordHeader h) {
320: return getDigestStr() == null ? "-" : getDigestStr();
321: }
322:
323: protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
324: return h.getMimetype();
325: }
326:
327: protected String outputCdx(final String strippedFileName)
328: throws IOException {
329: // Read the whole record so we get out a hash. Should be safe calling
330: // close on already closed Record.
331: close();
332: ArchiveRecordHeader h = getHeader();
333: StringBuilder buffer = new StringBuilder(
334: ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
335: buffer.append(h.getDate());
336: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
337: buffer.append(getIp4Cdx(h));
338: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
339: buffer.append(h.getUrl());
340: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
341: buffer.append(getMimetype4Cdx(h));
342: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
343: buffer.append(getStatusCode4Cdx(h));
344: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
345: buffer.append(getDigest4Cdx(h));
346: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
347: buffer.append(h.getOffset());
348: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
349: buffer.append(h.getLength());
350: buffer.append(ArchiveFileConstants.SINGLE_SPACE);
351: buffer
352: .append(strippedFileName != null ? strippedFileName
353: : '-');
354: return buffer.toString();
355: }
356:
357: /**
358: * Writes output on STDOUT.
359: * @throws IOException
360: */
361: public void dump() throws IOException {
362: dump(System.out);
363: }
364:
365: /**
366: * Writes output on passed <code>os</code>.
367: * @throws IOException
368: */
369: public void dump(final OutputStream os) throws IOException {
370: final byte[] outputBuffer = new byte[16 * 1024];
371: int read = outputBuffer.length;
372: while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
373: os.write(outputBuffer, 0, read);
374: }
375: os.flush();
376: }
377: }
|