/*
 * ARCWriter
 *
 * $Id: ARCWriter.java 5029 2007-03-29 23:53:50Z gojomo $
 *
 * Created on Jun 5, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.io.arc;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.io.GzippedInputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.MimetypeUtils;
/**
 * Write ARC files.
 *
 * Assumption is that the caller is managing access to this ARCWriter,
 * ensuring only one thread of control accesses this ARC file instance at
 * any one time.
 *
 * <p>ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>. This class writes version 1 of the ARC file format. It
 * also writes version 1.1, which is version 1 with data stuffed into the
 * body of the first arc record in the file, the arc file meta record itself.
 *
 * <p>An ARC file is three lines of meta data followed by an optional 'body'
 * and then a couple of '\n' and then: record, '\n', record, '\n', record,
 * etc. If we are writing compressed ARC files, then each of the ARC file
 * records is individually gzipped and concatenated together to make up a
 * single ARC file. In GZIP terms, each ARC record is a GZIP <i>member</i>
 * of a total gzip'd file.
 *
 * <p>The GZIPping of the ARC file meta data is a special case. It is
 * GZIPped w/ an extra GZIP header, a special Internet Archive (IA) extra
 * header field (i.e. FEXTRA is set in the GZIP header FLG field and an
 * extra field is appended to the GZIP header). The extra field has little
 * in it but its presence denotes this GZIP as an Internet Archive gzipped
 * ARC. See RFC1952 to learn about the GZIP header structure.
 *
 * <p>This class then does its GZIPping in the following fashion. Each GZIP
 * member is written w/ a new instance of GZIPOutputStream -- actually
 * ARCWriterGZIPOutputStream, so we can get access to the underlying stream.
 * The underlying stream stays open across GZIPOutputStream instantiations.
 * For the 'special' GZIPping of the ARC file meta data, we cheat by
 * catching the GZIPOutputStream output into a byte array and manipulating
 * it, adding the IA GZIP header, before writing to the stream.
 *
 * <p>I tried writing a resettable GZIPOutputStream and could make it work
 * w/ the SUN JDK, but the IBM JDK threw an NPE inside deflate.reset -- its
 * zlib native call doesn't seem to like the notion of resetting -- so I
 * gave up on it.
 *
 * <p>Because of the above and troubles with GZIPInputStream, we should
 * write our own GZIP*Streams, ones that are resettable and conscious of
 * gzip members.
 *
 * <p>This class will write until we hit &gt;= maxSize. The check is done at
 * record boundary. Records do not span ARC files. We will then close the
 * current file and open another, and then continue writing.
 *
 * <p><b>TESTING: </b>Here is how to test that produced ARC files are good
 * using the
 * <a href="http://www.archive.org/web/researcher/tool_documentation.php">alexa
 * ARC c-tools</a>:
 * <pre>
 * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
 *     /tmp/hx20040109230030-0.dat.gz
 * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
 * </pre>
 * Examine the produced cdx file to make sure it makes sense. Search
 * for 'no-type 0'. If found, then we're opening a gzip record w/o data to
 * write. This is bad.
 *
 * <p>You can also do <code>gzip -t FILENAME</code> and it will tell you if
 * the ARC makes sense to GZIP.
 *
 * <p>While being written, ARCs have a '.open' suffix appended.
 *
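 * <p>A rough usage sketch (directory, prefix, and record values below are
 * illustrative only; error handling is elided):
 * <pre>
 * AtomicInteger serialNo = new AtomicInteger();
 * ARCWriter writer = new ARCWriter(serialNo,
 *     Arrays.asList(new File("/tmp/arcs")), "test", "", true,
 *     100000000, null);
 * try {
 *     byte[] body = "hello".getBytes("UTF-8");
 *     ByteArrayOutputStream baos = new ByteArrayOutputStream();
 *     baos.write(body);
 *     writer.write("http://example.com/", "text/plain", "127.0.0.1",
 *         System.currentTimeMillis(), body.length, baos);
 * } finally {
 *     writer.close();
 * }
 * </pre>
 *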
 * @author stack
 */
public class ARCWriter extends WriterPoolMember implements ARCConstants {
    private static final Logger logger =
        Logger.getLogger(ARCWriter.class.getName());

    /**
     * Metadata line pattern.
     */
    private static final Pattern METADATA_LINE_PATTERN =
        Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");

    /**
     * Buffer reused when copying record content from streams.
     */
    private final byte[] readbuffer = new byte[4 * 1024];

    private List metadata = null;

    /**
     * Constructor.
     * Takes a stream. Use with caution. There is no upper-bound check on
     * size. Will just keep writing.
     *
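     * <p>The <code>metadata</code> list may mix String and File entries,
     * e.g. (paths below are hypothetical):
     * <pre>
     * List metadata = new ArrayList();
     * metadata.add("operator: admin@example.com\n");
     * metadata.add(new File("/tmp/crawl-order.xml"));
     * </pre>
     *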
     * @param serialNo used to generate unique file name sequences
     * @param out Where to write.
     * @param arc File the <code>out</code> is connected to.
     * @param cmprs Compress the content written.
     * @param a14DigitDate If null, we'll write current time.
     * @param metadata File meta data. Can be null. Is list of File and/or
     * String objects.
     * @throws IOException
     */
    public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
            final File arc, final boolean cmprs, String a14DigitDate,
            final List metadata)
    throws IOException {
        super(serialNo, out, arc, cmprs, a14DigitDate);
        this.metadata = metadata;
        writeFirstRecord(a14DigitDate);
    }

    /**
     * Constructor.
     *
     * @param serialNo used to generate unique file name sequences
     * @param dirs Where to drop the ARC files.
     * @param prefix ARC file prefix to use. If null, we use
     * DEFAULT_ARC_FILE_PREFIX.
     * @param cmprs Compress the ARC files written. The compression is done
     * by individually gzipping each record added to the ARC file: i.e. the
     * ARC file is a bunch of gzipped records concatenated together.
     * @param maxSize Maximum size for ARC files written.
     */
    public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
            final String prefix, final boolean cmprs, final long maxSize) {
        this(serialNo, dirs, prefix, "", cmprs, maxSize, null);
    }

    /**
     * Constructor.
     *
     * @param serialNo used to generate unique file name sequences
     * @param dirs Where to drop files.
     * @param prefix File prefix to use.
     * @param suffix File tail to use. If null, unused.
     * @param cmprs Compress the records written.
     * @param maxSize Maximum size for ARC files written.
     * @param meta File meta data. Can be null. Is list of File and/or
     * String objects.
     */
    public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
            final String prefix, final String suffix, final boolean cmprs,
            final long maxSize, final List meta) {
        super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
            ARC_FILE_EXTENSION);
        this.metadata = meta;
    }

    protected String createFile() throws IOException {
        String name = super.createFile();
        writeFirstRecord(getCreateTimestamp());
        return name;
    }

    private void writeFirstRecord(final String ts) throws IOException {
        write(generateARCFileMetaData(ts));
    }

    /**
     * Write out the ARCMetaData.
     *
     * <p>Generate ARC file meta data. Currently we only do version 1 of the
     * ARC file formats, or version 1.1 when metadata has been supplied (we
     * write it into the body of the first record in the arc file).
     *
     * <p>Version 1 metadata looks roughly like this:
     *
     * <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
     * 20040110013326 text/plain 77
     * 1 0 InternetArchive
     * URL IP-address Archive-date Content-type Archive-length
     * </pre>
     *
     * <p>If compress is set, then we generate a header that has been gzipped
     * in the Internet Archive manner. Such a gzipping enables the FEXTRA
     * flag in the FLG field of the gzip header. It then appends an extra
     * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
     * bytes are the length of the field and the last 6 bytes the Internet
     * Archive header. To learn about GZIP format, see RFC1952. To learn
     * about the Internet Archive extra header field, read the source for
     * av_ziparc which can be found at
     * <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
     *
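     * <p>As a sketch, using the RFC1952 field names, the compressed meta
     * record this method assembles starts like this (byte offsets on the
     * left):
     * <pre>
     * 0     ID1    0x1f
     * 1     ID2    0x8b
     * 2     CM     8 (deflate)
     * 3     FLG    4 (FEXTRA set)
     * 4-7   MTIME
     * 8     XFL
     * 9     OS     3 (Unix)
     * 10..  the IA 'LX' extra field, then the deflated data
     * </pre>
     *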
     * <p>We do things in this roundabout manner because the java
     * GZIPOutputStream does not give access to GZIP header fields.
     *
     * @param date Date to put into the ARC metadata.
     *
     * @return Byte array filled w/ the arc header.
     * @throws IOException
     */
    private byte[] generateARCFileMetaData(String date)
    throws IOException {
        int metadataBodyLength = getMetadataLength();
        // If metadata body, then the minor part of the version is '1' rather
        // than '0'.
        String metadataHeaderLinesTwoAndThree =
            getMetadataHeaderLinesTwoAndThree("1 "
                + ((metadataBodyLength > 0) ? "1" : "0"));
        int recordLength = metadataBodyLength
            + metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
        String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename()
            + " 0.0.0.0 " + date + " text/plain " + recordLength
            + metadataHeaderLinesTwoAndThree;
        ByteArrayOutputStream metabaos =
            new ByteArrayOutputStream(recordLength);
        // Write the metadata header.
        metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
        // Write the metadata body, if anything to write.
        if (metadataBodyLength > 0) {
            writeMetaData(metabaos);
        }

        // Write out a LINE_SEPARATOR to end this record.
        metabaos.write(LINE_SEPARATOR);

        // Now get bytes of all just written and compress if flag set.
        byte[] bytes = metabaos.toByteArray();

        if (isCompressed()) {
            // GZIP the header but catch the gzipping into a byte array so we
            // can add the special IA GZIP header to the product. After
            // manipulations, write to the output stream (the JAVA GZIP
            // implementation does not give access to the GZIP header; it
            // produces a 'default' header only). We can get away w/ these
            // manipulations because the GZIP 'default' header doesn't
            // do the 'optional' CRC'ing of the header.
            byte[] gzippedMetaData = GzippedInputStream.gzip(bytes);
            if (gzippedMetaData[3] != 0) {
                throw new IOException("The GZIP FLG header is unexpectedly"
                    + " non-zero. Need to add smarter code that can deal"
                    + " with already extant extra GZIP header fields.");
            }
            // Set the GZIP FLG header to '4' which says that the GZIP header
            // has extra fields. Then insert the alexa {'L', 'X', '0', '0',
            // '0', '0'} 'extra' field. The IA GZIP header will also set byte
            // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
            gzippedMetaData[3] = 4;
            gzippedMetaData[9] = 3;
            byte[] assemblyBuffer = new byte[gzippedMetaData.length
                + ARC_GZIP_EXTRA_FIELD.length];
            // '10' in the below is a pointer past the following bytes of the
            // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
            // RFC1952 for an explanation of the abbreviations just used.
            System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
            System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer,
                10, ARC_GZIP_EXTRA_FIELD.length);
            System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
                10 + ARC_GZIP_EXTRA_FIELD.length,
                gzippedMetaData.length - 10);
            bytes = assemblyBuffer;
        }
        return bytes;
    }

    public String getMetadataHeaderLinesTwoAndThree(String version) {
        StringBuffer buffer = new StringBuffer();
        buffer.append(LINE_SEPARATOR);
        buffer.append(version);
        buffer.append(" InternetArchive");
        buffer.append(LINE_SEPARATOR);
        buffer.append("URL IP-address Archive-date Content-type Archive-length");
        buffer.append(LINE_SEPARATOR);
        return buffer.toString();
    }

    /**
     * Write all metadata to passed <code>baos</code>.
     *
     * @param baos Byte array to write to.
     * @throws UnsupportedEncodingException
     * @throws IOException
     */
    private void writeMetaData(ByteArrayOutputStream baos)
    throws UnsupportedEncodingException, IOException {
        if (this.metadata == null) {
            return;
        }

        for (Iterator i = this.metadata.iterator(); i.hasNext();) {
            Object obj = i.next();
            if (obj instanceof String) {
                baos.write(((String) obj).getBytes(DEFAULT_ENCODING));
            } else if (obj instanceof File) {
                InputStream is = null;
                try {
                    is = new BufferedInputStream(
                        new FileInputStream((File) obj));
                    byte[] buffer = new byte[4096];
                    for (int read = -1; (read = is.read(buffer)) != -1;) {
                        baos.write(buffer, 0, read);
                    }
                } finally {
                    if (is != null) {
                        is.close();
                    }
                }
            } else if (obj != null) {
                logger.severe("Unsupported metadata type: " + obj);
            }
        }
    }

    /**
     * @return Total length of metadata.
     * @throws UnsupportedEncodingException
     */
    private int getMetadataLength() throws UnsupportedEncodingException {
        int result = 0;
        if (this.metadata != null) {
            for (Iterator i = this.metadata.iterator(); i.hasNext();) {
                Object obj = i.next();
                if (obj instanceof String) {
                    result += ((String) obj).getBytes(DEFAULT_ENCODING).length;
                } else if (obj instanceof File) {
                    result += ((File) obj).length();
                } else {
                    logger.severe("Unsupported metadata type: " + obj);
                }
            }
        }
        return result;
    }

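    /**
     * Write an ARC record whose body is already buffered in memory.
     *
     * @param uri URI of crawled document.
     * @param contentType Content type of crawled document.
     * @param hostIP Host IP of crawled document.
     * @param fetchBeginTimeStamp Fetch begin time, in milliseconds since
     * the epoch; must be &gt; 0.
     * @param recordLength Length of the record body as recorded in the
     * metadata line.
     * @param baos Buffer holding the record body.
     * @throws IOException
     */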
    public void write(String uri, String contentType, String hostIP,
            long fetchBeginTimeStamp, long recordLength,
            ByteArrayOutputStream baos) throws IOException {
        preWriteRecordTasks();
        try {
            write(getMetaLine(uri, contentType, hostIP,
                fetchBeginTimeStamp, recordLength).getBytes(UTF8));
            baos.writeTo(getOutputStream());
            write(LINE_SEPARATOR);
        } finally {
            postWriteRecordTasks();
        }
    }

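    /**
     * Write an ARC record, copying exactly <code>recordLength</code> bytes
     * of the record body from the passed stream.
     *
     * @param uri URI of crawled document.
     * @param contentType Content type of crawled document.
     * @param hostIP Host IP of crawled document.
     * @param fetchBeginTimeStamp Fetch begin time, in milliseconds since
     * the epoch; must be &gt; 0.
     * @param recordLength Number of bytes to copy from <code>in</code>.
     * @param in Stream to read the record body from.
     * @throws IOException
     */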
    public void write(String uri, String contentType, String hostIP,
            long fetchBeginTimeStamp, long recordLength, InputStream in)
    throws IOException {
        preWriteRecordTasks();
        try {
            write(getMetaLine(uri, contentType, hostIP,
                fetchBeginTimeStamp, recordLength).getBytes(UTF8));
            readFullyFrom(in, recordLength, this.readbuffer);
            write(LINE_SEPARATOR);
        } finally {
            postWriteRecordTasks();
        }
    }

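    /**
     * Write an ARC record from a ReplayInputStream. The stream is always
     * closed, and an IOException is thrown if it does not drain to exactly
     * the expected length.
     *
     * @param uri URI of crawled document.
     * @param contentType Content type of crawled document.
     * @param hostIP Host IP of crawled document.
     * @param fetchBeginTimeStamp Fetch begin time, in milliseconds since
     * the epoch; must be &gt; 0.
     * @param recordLength Expected length of the record body.
     * @param ris Stream to replay the record body from.
     * @throws IOException
     */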
    public void write(String uri, String contentType, String hostIP,
            long fetchBeginTimeStamp, long recordLength,
            ReplayInputStream ris) throws IOException {
        preWriteRecordTasks();
        try {
            write(getMetaLine(uri, contentType, hostIP,
                fetchBeginTimeStamp, recordLength).getBytes(UTF8));
            try {
                ris.readFullyTo(getOutputStream());
                long remaining = ris.remaining();
                // Should be zero at this stage. If not, something is wrong.
                if (remaining != 0) {
                    String message = "Gap between expected and actual: "
                        + remaining + LINE_SEPARATOR + DevUtils.extraInfo()
                        + " writing arc " + this.getFile().getAbsolutePath();
                    DevUtils.warnHandle(new Throwable(message), message);
                    throw new IOException(message);
                }
            } finally {
                ris.close();
            }

            // Write out trailing newline.
            write(LINE_SEPARATOR);
        } finally {
            postWriteRecordTasks();
        }
    }

    /**
     * @param uri
     * @param contentType
     * @param hostIP
     * @param fetchBeginTimeStamp
     * @param recordLength
     * @return Metadata line for an ARCRecord made of passed components.
     * @exception IOException
     */
    protected String getMetaLine(String uri, String contentType,
            String hostIP, long fetchBeginTimeStamp, long recordLength)
    throws IOException {
        if (fetchBeginTimeStamp <= 0) {
            throw new IOException("Bogus fetchBeginTimestamp: "
                + Long.toString(fetchBeginTimeStamp));
        }

        return validateMetaLine(createMetaline(uri, hostIP,
            ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
            MimetypeUtils.truncate(contentType),
            Long.toString(recordLength)));
    }

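    /**
     * Assemble a metadata line from the passed, already-formatted fields.
     * A hypothetical result (fields separated by single spaces):
     * <pre>
     * http://example.com/ 127.0.0.1 20030614070826 text/plain 11
     * </pre>
     *
     * @param uri URI of crawled document.
     * @param hostIP Host IP of crawled document.
     * @param timeStamp 14-digit timestamp.
     * @param mimetype Truncated content type.
     * @param recordLength Record body length as a string.
     * @return Metadata line, terminated with LINE_SEPARATOR.
     */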
    public String createMetaline(String uri, String hostIP,
            String timeStamp, String mimetype, String recordLength) {
        return uri + HEADER_FIELD_SEPARATOR + hostIP
            + HEADER_FIELD_SEPARATOR + timeStamp
            + HEADER_FIELD_SEPARATOR + mimetype
            + HEADER_FIELD_SEPARATOR + recordLength
            + LINE_SEPARATOR;
    }

    /**
     * Test that the metadata line is valid before writing.
     * @param metaLineStr
     * @throws IOException
     * @return The passed in metaline.
     */
    protected String validateMetaLine(String metaLineStr)
    throws IOException {
        if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
            throw new IOException("Metadata line length is "
                + metaLineStr.length() + " which is greater than the maximum "
                + MAX_METADATA_LINE_LENGTH);
        }
        Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
        if (!m.matches()) {
            throw new IOException("Metadata line doesn't match expected"
                + " pattern: " + metaLineStr);
        }
        return metaLineStr;
    }
}
|