001: /* ARCRecord
002: *
003: * $Id: ARCRecord.java 4988 2007-03-12 21:18:08Z stack-sf $
004: *
005: * Created on Jan 7, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.io.arc;
026:
027: import java.io.ByteArrayInputStream;
028: import java.io.ByteArrayOutputStream;
029: import java.io.IOException;
030: import java.io.InputStream;
031:
032: import org.apache.commons.httpclient.Header;
033: import org.apache.commons.httpclient.HttpParser;
034: import org.apache.commons.httpclient.StatusLine;
035: import org.apache.commons.httpclient.util.EncodingUtil;
036: import org.archive.io.ArchiveRecord;
037: import org.archive.io.ArchiveRecordHeader;
038: import org.archive.io.RecoverableIOException;
039:
040: /**
041: * An ARC file record.
042: * Does not encompass the ARCRecord metadata line, just the record content.
043: * @author stack
044: */
045: public class ARCRecord extends ArchiveRecord implements ARCConstants {
046: /**
047: * Http status line object.
048: *
049: * May be null if record is not http.
050: */
051: private StatusLine httpStatus = null;
052:
053: /**
054: * Http header bytes.
055: *
056: * If non-null and bytes available, give out its contents before we
057: * go back to the underlying stream.
058: */
059: private InputStream httpHeaderStream = null;
060:
061: /**
062: * Http headers.
063: *
064: * Only populated after reading of headers.
065: */
066: private Header[] httpHeaders = null;
067:
068: /**
069: * Minimal http header length.
070: *
071: * I've seen in arcs content length of 1 with no
072: * header.
073: */
074: private static final long MIN_HTTP_HEADER_LENGTH = "HTTP/1.1 200 OK\r\n"
075: .length();
076:
077: /**
078: * Constructor.
079: *
080: * @param in Stream cue'd up to be at the start of the record this instance
081: * is to represent.
082: * @param metaData Meta data.
083: * @throws IOException
084: */
085: public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
086: throws IOException {
087: this (in, metaData, 0, true, false, true);
088: }
089:
090: /**
091: * Constructor.
092: *
093: * @param in Stream cue'd up to be at the start of the record this instance
094: * is to represent.
095: * @param metaData Meta data.
096: * @param bodyOffset Offset into the body. Usually 0.
097: * @param digest True if we're to calculate digest for this record. Not
098: * digesting saves about ~15% of cpu during an ARC parse.
099: * @param strict Be strict parsing (Parsing stops if ARC inproperly
100: * formatted).
101: * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
102: * about ~20% of CPU during an ARC parse.
103: * @throws IOException
104: */
105: public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
106: int bodyOffset, boolean digest, boolean strict,
107: final boolean parseHttpHeaders) throws IOException {
108: super (in, metaData, bodyOffset, digest, strict);
109: if (parseHttpHeaders) {
110: this .httpHeaderStream = readHttpHeader();
111: }
112: }
113:
114: /**
115: * Skip over the the http header if one present.
116: *
117: * Subsequent reads will get the body.
118: *
119: * <p>Calling this method in the midst of reading the header
120: * will make for strange results. Otherwise, safe to call
121: * at any time though before reading any of the arc record
122: * content is only time that it makes sense.
123: *
124: * <p>After calling this method, you can call
125: * {@link #getHttpHeaders()} to get the read http header.
126: *
127: * @throws IOException
128: */
129: public void skipHttpHeader() throws IOException {
130: if (this .httpHeaderStream != null) {
131: // Empty the httpHeaderStream
132: for (int available = this .httpHeaderStream.available(); this .httpHeaderStream != null
133: && (available = this .httpHeaderStream.available()) > 0;) {
134: // We should be in this loop once only we should only do this
135: // buffer allocation once.
136: byte[] buffer = new byte[available];
137: // The read nulls out httpHeaderStream when done with it so
138: // need check for null in the loop control line.
139: read(buffer, 0, available);
140: }
141: }
142: }
143:
144: public void dumpHttpHeader() throws IOException {
145: if (this .httpHeaderStream == null) {
146: return;
147: }
148: // Dump the httpHeaderStream to STDOUT
149: for (int available = this .httpHeaderStream.available(); this .httpHeaderStream != null
150: && (available = this .httpHeaderStream.available()) > 0;) {
151: // We should be in this loop only once and should do this
152: // buffer allocation once.
153: byte[] buffer = new byte[available];
154: // The read nulls out httpHeaderStream when done with it so
155: // need check for null in the loop control line.
156: int read = read(buffer, 0, available);
157: System.out.write(buffer, 0, read);
158: }
159: }
160:
161: /**
162: * Read http header if present. Technique borrowed from HttpClient HttpParse
163: * class.
164: *
165: * @return ByteArrayInputStream with the http header in it or null if no
166: * http header.
167: * @throws IOException
168: */
169: private InputStream readHttpHeader() throws IOException {
170: // If judged a record that doesn't have an http header, return
171: // immediately.
172: if (!getHeader().getUrl().startsWith("http")
173: || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
174: return null;
175: }
176: byte[] statusBytes = HttpParser.readRawLine(getIn());
177: int eolCharCount = getEolCharsCount(statusBytes);
178: if (eolCharCount <= 0) {
179: throw new IOException(
180: "Failed to read http status where one "
181: + " was expected: "
182: + new String(statusBytes));
183: }
184: String statusLine = EncodingUtil.getString(statusBytes, 0,
185: statusBytes.length - eolCharCount,
186: ARCConstants.DEFAULT_ENCODING);
187: if ((statusLine == null)
188: || !StatusLine.startsWithHTTP(statusLine)) {
189: if (statusLine.startsWith("DELETED")) {
190: // Some old ARCs have deleted records like following:
191: // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
192: // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
193: // (follows ~29K spaces)
194: // For now, throw a RecoverableIOException so if iterating over
195: // records, we keep going. TODO: Later make a legitimate
196: // ARCRecord from the deleted record rather than throw
197: // exception.
198: throw new DeletedARCRecordIOException(statusLine);
199: } else {
200: throw new IOException(
201: "Failed parse of http status line.");
202: }
203: }
204: this .httpStatus = new StatusLine(statusLine);
205:
206: // Save off all bytes read. Keep them as bytes rather than
207: // convert to strings so we don't have to worry about encodings
208: // though this should never be a problem doing http headers since
209: // its all supposed to be ascii.
210: ByteArrayOutputStream baos = new ByteArrayOutputStream(
211: statusBytes.length + 4 * 1024);
212: baos.write(statusBytes);
213:
214: // Now read rest of the header lines looking for the separation
215: // between header and body.
216: for (byte[] lineBytes = null; true;) {
217: lineBytes = HttpParser.readRawLine(getIn());
218: eolCharCount = getEolCharsCount(lineBytes);
219: if (eolCharCount <= 0) {
220: throw new IOException("Failed reading http headers: "
221: + ((lineBytes != null) ? new String(lineBytes)
222: : null));
223: }
224: // Save the bytes read.
225: baos.write(lineBytes);
226: if ((lineBytes.length - eolCharCount) <= 0) {
227: // We've finished reading the http header.
228: break;
229: }
230: }
231:
232: byte[] headerBytes = baos.toByteArray();
233: // Save off where body starts.
234: this .getMetaData().setContentBegin(headerBytes.length);
235: ByteArrayInputStream bais = new ByteArrayInputStream(
236: headerBytes);
237: if (!bais.markSupported()) {
238: throw new IOException(
239: "ByteArrayInputStream does not support mark");
240: }
241: bais.mark(headerBytes.length);
242: // Read the status line. Don't let it into the parseHeaders function.
243: // It doesn't know what to do with it.
244: bais.read(statusBytes, 0, statusBytes.length);
245: this .httpHeaders = HttpParser.parseHeaders(bais,
246: ARCConstants.DEFAULT_ENCODING);
247: this .getMetaData().setStatusCode(
248: Integer.toString(getStatusCode()));
249: bais.reset();
250: return bais;
251: }
252:
253: private static class DeletedARCRecordIOException extends
254: RecoverableIOException {
255: public DeletedARCRecordIOException(final String reason) {
256: super (reason);
257: }
258: }
259:
260: /**
261: * Return status code for this record.
262: *
263: * This method will return -1 until the http header has been read.
264: * @return Status code.
265: */
266: public int getStatusCode() {
267: return (this .httpStatus == null) ? -1 : this .httpStatus
268: .getStatusCode();
269: }
270:
271: /**
272: * @param bytes Array of bytes to examine for an EOL.
273: * @return Count of end-of-line characters or zero if none.
274: */
275: private int getEolCharsCount(byte[] bytes) {
276: int count = 0;
277: if (bytes != null && bytes.length >= 1
278: && bytes[bytes.length - 1] == '\n') {
279: count++;
280: if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
281: count++;
282: }
283: }
284: return count;
285: }
286:
287: /**
288: * @return Meta data for this record.
289: */
290: public ARCRecordMetaData getMetaData() {
291: return (ARCRecordMetaData) getHeader();
292: }
293:
294: /**
295: * @return http headers (Only available after header has been read).
296: */
297: public Header[] getHttpHeaders() {
298: return this .httpHeaders;
299: }
300:
301: /**
302: * @return Next character in this ARCRecord's content else -1 if at end of
303: * this record.
304: * @throws IOException
305: */
306: public int read() throws IOException {
307: int c = -1;
308: if (this .httpHeaderStream != null
309: && (this .httpHeaderStream.available() > 0)) {
310: // If http header, return bytes from it before we go to underlying
311: // stream.
312: c = this .httpHeaderStream.read();
313: // If done with the header stream, null it out.
314: if (this .httpHeaderStream.available() <= 0) {
315: this .httpHeaderStream = null;
316: }
317: incrementPosition();
318: } else {
319: c = super .read();
320: }
321: return c;
322: }
323:
324: public int read(byte[] b, int offset, int length)
325: throws IOException {
326: int read = -1;
327: if (this .httpHeaderStream != null
328: && (this .httpHeaderStream.available() > 0)) {
329: // If http header, return bytes from it before we go to underlying
330: // stream.
331: read = Math.min(length, this .httpHeaderStream.available());
332: if (read == 0) {
333: read = -1;
334: } else {
335: read = this .httpHeaderStream.read(b, offset, read);
336: }
337: // If done with the header stream, null it out.
338: if (this .httpHeaderStream.available() <= 0) {
339: this .httpHeaderStream = null;
340: }
341: incrementPosition(read);
342: } else {
343: read = super .read(b, offset, length);
344: }
345: return read;
346: }
347:
348: /**
349: * @return Offset at which the body begins (Only known after
350: * header has been read) or -1 if none or if we haven't read
351: * headers yet. Usually length of HTTP headers (does not include ARC
352: * metadata line length).
353: */
354: public int getBodyOffset() {
355: return this .getMetaData().getContentBegin();
356: }
357:
358: @Override
359: protected String getIp4Cdx(ArchiveRecordHeader h) {
360: String result = null;
361: if (h instanceof ARCRecordMetaData) {
362: result = ((ARCRecordMetaData) h).getIp();
363: }
364: return (result != null) ? result : super .getIp4Cdx(h);
365: }
366:
367: @Override
368: protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
369: String result = null;
370: if (h instanceof ARCRecordMetaData) {
371: result = ((ARCRecordMetaData) h).getStatusCode();
372: }
373: return (result != null) ? result : super .getStatusCode4Cdx(h);
374: }
375:
376: @Override
377: protected String getDigest4Cdx(ArchiveRecordHeader h) {
378: String result = null;
379: if (h instanceof ARCRecordMetaData) {
380: result = ((ARCRecordMetaData) h).getDigest();
381: }
382: return (result != null) ? result : super.getDigest4Cdx(h);
383: }
384: }
|