001: /* $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $
002: *
003: * Created on July 27th, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc.v10;
024:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.archive.io.UTF8Bytes;
import org.archive.io.WriterPoolMember;
import org.archive.io.warc.WARCConstants;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;
046:
047: /**
048: * <b>Experimental</b> WARC implementation.
049: *
050: * Based on unreleased version 0.9 of <a
051: * href="http://archive-access.sourceforge.net//warc/warc_file_format.html">WARC
052: * File Format</a> document. Specification and implementation subject to
053: * change.
054: *
055: * <p>Assumption is that the caller is managing access to this
056: * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
057: * at any one time.
058: *
059: * <p>While being written, WARCs have a '.open' suffix appended.
060: *
061: * @author stack
062: * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
063: */
064: public class ExperimentalWARCWriter extends WriterPoolMember implements
065: WARCConstants {
066: /**
067: * Buffer to reuse writing streams.
068: */
069: private final byte[] readbuffer = new byte[16 * 1024];
070:
071: /**
072: * NEWLINE as bytes.
073: */
074: public static byte[] CRLF_BYTES;
075: static {
076: try {
077: CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
078: } catch (Exception e) {
079: e.printStackTrace();
080: }
081: };
082:
083: /**
084: * Formatter for the length.
085: */
086: private static NumberFormat RECORD_LENGTH_FORMATTER = new DecimalFormat(
087: PLACEHOLDER_RECORD_LENGTH_STRING);
088:
089: /**
090: * Metadata.
091: * TODO: Exploit writing warcinfo record. Currently unused.
092: */
093: private final List fileMetadata;
094:
095: /**
096: * Shutdown Constructor
097: * Has default access so can make instance to test utility methods.
098: */
099: ExperimentalWARCWriter() {
100: this (null, null, "", "", true, -1, null);
101: }
102:
103: /**
104: * Constructor.
105: * Takes a stream. Use with caution. There is no upperbound check on size.
106: * Will just keep writing. Only pass Streams that are bounded.
107: * @param serialNo used to generate unique file name sequences
108: * @param out Where to write.
109: * @param f File the <code>out</code> is connected to.
110: * @param cmprs Compress the content written.
111: * @param a14DigitDate If null, we'll write current time.
112: * @throws IOException
113: */
114: public ExperimentalWARCWriter(final AtomicInteger serialNo,
115: final OutputStream out, final File f, final boolean cmprs,
116: final String a14DigitDate, final List warcinfoData)
117: throws IOException {
118: super (serialNo, out, f, cmprs, a14DigitDate);
119: // TODO: Currently unused.
120: this .fileMetadata = warcinfoData;
121: }
122:
123: /**
124: * Constructor.
125: *
126: * @param dirs Where to drop files.
127: * @param prefix File prefix to use.
128: * @param cmprs Compress the records written.
129: * @param maxSize Maximum size for ARC files written.
130: * @param suffix File tail to use. If null, unused.
131: * @param warcinfoData File metadata for warcinfo record.
132: */
133: public ExperimentalWARCWriter(final AtomicInteger serialNo,
134: final List<File> dirs, final String prefix,
135: final String suffix, final boolean cmprs,
136: final long maxSize, final List warcinfoData) {
137: super (serialNo, dirs, prefix, suffix, cmprs, maxSize,
138: WARC_FILE_EXTENSION);
139: // TODO: Currently unused.
140: this .fileMetadata = warcinfoData;
141: }
142:
143: @Override
144: protected String createFile(File file) throws IOException {
145: String filename = super .createFile(file);
146: writeWarcinfoRecord(filename);
147: return filename;
148: }
149:
150: protected void baseCharacterCheck(final char c,
151: final String parameter) throws IOException {
152: // TODO: Too strict? UNICODE control characters?
153: if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
154: throw new IOException("Contains illegal character 0x"
155: + Integer.toHexString(c) + ": " + parameter);
156: }
157: }
158:
159: protected String checkHeaderLineParameters(final String parameter)
160: throws IOException {
161: for (int i = 0; i < parameter.length(); i++) {
162: final char c = parameter.charAt(i);
163: baseCharacterCheck(c, parameter);
164: if (Character.isWhitespace(c)) {
165: throw new IOException(
166: "Contains disallowed white space 0x"
167: + Integer.toHexString(c) + ": "
168: + parameter);
169: }
170: }
171: return parameter;
172: }
173:
174: protected String checkHeaderLineMimetypeParameter(
175: final String parameter) throws IOException {
176: StringBuilder sb = new StringBuilder(parameter.length());
177: boolean wasWhitespace = false;
178: for (int i = 0; i < parameter.length(); i++) {
179: char c = parameter.charAt(i);
180: if (Character.isWhitespace(c)) {
181: // Map all to ' ' and collapse multiples into one.
182: // TODO: Make sure white space occurs in legal location --
183: // before parameter or inside quoted-string.
184: if (wasWhitespace) {
185: continue;
186: }
187: wasWhitespace = true;
188: c = ' ';
189: } else {
190: wasWhitespace = false;
191: baseCharacterCheck(c, parameter);
192: }
193: sb.append(c);
194: }
195:
196: return sb.toString();
197: }
198:
199: protected byte[] createRecordHeaderline(final String type,
200: final String url, final String create14DigitDate,
201: final String mimetype, final URI recordId,
202: final int namedFieldsLength, final long contentLength)
203: throws IOException {
204: final StringBuilder sb = new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
205: sb.append(WARC_010_ID);
206: sb.append(HEADER_FIELD_SEPARATOR);
207: sb.append(PLACEHOLDER_RECORD_LENGTH_STRING);
208: sb.append(HEADER_FIELD_SEPARATOR);
209: sb.append(type);
210: sb.append(HEADER_FIELD_SEPARATOR);
211: sb.append(checkHeaderLineParameters(url));
212: sb.append(HEADER_FIELD_SEPARATOR);
213: sb.append(checkHeaderLineParameters(create14DigitDate));
214: sb.append(HEADER_FIELD_SEPARATOR);
215: // 0.9 of spec. has mimetype second-to-last and recordid last on
216: // header line. Here we swap their positions and allow writing
217: // of full mimetypes rather than the curtailed type we used write into
218: // ARCs. These two deviations to be proposed as amendments to spec 0.9.
219: sb.append(checkHeaderLineParameters(recordId.toString()));
220: sb.append(HEADER_FIELD_SEPARATOR);
221: sb.append(checkHeaderLineMimetypeParameter(mimetype));
222: // Add terminating CRLF.
223: sb.append(CRLF);
224:
225: long length = sb.length() + namedFieldsLength + contentLength;
226:
227: // Insert length and pad out to fixed width with zero prefix to
228: // highlight 'fixed-widthness' of length.
229: int start = WARC_010_ID.length() + 1 /*HEADER_FIELD_SEPARATOR */;
230: int end = start + PLACEHOLDER_RECORD_LENGTH_STRING.length();
231: String lenStr = RECORD_LENGTH_FORMATTER.format(length);
232: sb.replace(start, end, lenStr);
233:
234: return sb.toString().getBytes(HEADER_LINE_ENCODING);
235: }
236:
237: protected void writeRecord(final String type, final String url,
238: final String create14DigitDate, final String mimetype,
239: final URI recordId, ANVLRecord namedFields,
240: final InputStream contentStream, final long contentLength)
241: throws IOException {
242: if (!TYPES_LIST.contains(type)) {
243: throw new IllegalArgumentException("Unknown record type: "
244: + type);
245: }
246: if (contentLength == 0
247: && (namedFields == null || namedFields.size() <= 0)) {
248: throw new IllegalArgumentException(
249: "Cannot have a record made "
250: + "of a Header line only (Content and Named Fields are empty).");
251: }
252:
253: preWriteRecordTasks();
254: try {
255: if (namedFields == null) {
256: // Use the empty anvl record so the length of blank line on
257: // end gets counted as part of the record length.
258: namedFields = ANVLRecord.EMPTY_ANVL_RECORD;
259: }
260:
261: // Serialize metadata first so we have metadata length.
262: final byte[] namedFieldsBlock = namedFields.getUTF8Bytes();
263: // Now serialize the Header line.
264: final byte[] header = createRecordHeaderline(type, url,
265: create14DigitDate, mimetype, recordId,
266: namedFieldsBlock.length, contentLength);
267: write(header);
268: write(namedFieldsBlock);
269: if (contentStream != null && contentLength > 0) {
270: readFullyFrom(contentStream, contentLength,
271: this .readbuffer);
272: }
273:
274: // Write out the two blank lines at end of all records.
275: // TODO: Why? Messes up skipping through file. Also not in grammar.
276: write(CRLF_BYTES);
277: write(CRLF_BYTES);
278: } finally {
279: postWriteRecordTasks();
280: }
281: }
282:
283: protected URI generateRecordId(final Map<String, String> qualifiers)
284: throws IOException {
285: URI rid = null;
286: try {
287: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
288: qualifiers);
289: } catch (URISyntaxException e) {
290: // Convert to IOE so can let it out.
291: throw new IOException(e.getMessage());
292: }
293: return rid;
294: }
295:
296: protected URI generateRecordId(final String key, final String value)
297: throws IOException {
298: URI rid = null;
299: try {
300: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
301: key, value);
302: } catch (URISyntaxException e) {
303: // Convert to IOE so can let it out.
304: throw new IOException(e.getMessage());
305: }
306: return rid;
307: }
308:
309: public URI writeWarcinfoRecord(String filename) throws IOException {
310: return writeWarcinfoRecord(filename, null);
311: }
312:
313: public URI writeWarcinfoRecord(String filename,
314: final String description) throws IOException {
315: // Strip .open suffix if present.
316: if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
317: filename = filename.substring(0, filename.length()
318: - WriterPoolMember.OCCUPIED_SUFFIX.length());
319: }
320: ANVLRecord record = new ANVLRecord(2);
321: record.addLabelValue(NAMED_FIELD_WARCFILENAME, filename);
322: if (description != null && description.length() > 0) {
323: record.addLabelValue(NAMED_FIELD_DESCRIPTION, description);
324: }
325: // Add warcinfo body.
326: byte[] warcinfoBody = null;
327: if (this .fileMetadata == null) {
328: // TODO: What to write into a warcinfo? What to associate?
329: warcinfoBody = "TODO: Unimplemented".getBytes();
330: } else {
331: ByteArrayOutputStream baos = new ByteArrayOutputStream();
332: for (final Iterator i = this .fileMetadata.iterator(); i
333: .hasNext();) {
334: baos
335: .write(i.next().toString().getBytes(
336: UTF8Bytes.UTF8));
337: }
338: warcinfoBody = baos.toByteArray();
339: }
340: URI uri = writeWarcinfoRecord("text/plain", record,
341: new ByteArrayInputStream(warcinfoBody),
342: warcinfoBody.length);
343: // TODO: If at start of file, and we're writing compressed,
344: // write out our distinctive GZIP extensions.
345: return uri;
346: }
347:
348: /**
349: * Write a warcinfo to current file.
350: * TODO: Write crawl metadata or pointers to crawl description.
351: * @param mimetype Mimetype of the <code>fileMetadata</code> block.
352: * @param namedFields Named fields. Pass <code>null</code> if none.
353: * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
354: * @param fileMetadataLength Length of <code>fileMetadata</code>.
355: * @throws IOException
356: * @return Generated record-id made with
357: * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
358: * the current filename.
359: */
360: public URI writeWarcinfoRecord(final String mimetype,
361: final ANVLRecord namedFields,
362: final InputStream fileMetadata,
363: final long fileMetadataLength) throws IOException {
364: final URI recordid = generateRecordId(TYPE, WARCINFO);
365: writeWarcinfoRecord(ArchiveUtils.get14DigitDate(), mimetype,
366: recordid, namedFields, fileMetadata, fileMetadataLength);
367: return recordid;
368: }
369:
370: /**
371: * Write a <code>warcinfo</code> to current file.
372: * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
373: * @param recordId URI to use for this warcinfo.
374: * @param create14DigitDate Record creation date as 14 digit date.
375: * @param mimetype Mimetype of the <code>fileMetadata</code>.
376: * @param namedFields Named fields.
377: * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
378: * @param fileMetadataLength Length of <code>fileMetadata</code>.
379: * @throws IOException
380: */
381: public void writeWarcinfoRecord(final String create14DigitDate,
382: final String mimetype, final URI recordId,
383: final ANVLRecord namedFields,
384: final InputStream fileMetadata,
385: final long fileMetadataLength) throws IOException {
386: writeRecord(WARCINFO, recordId.toString(), create14DigitDate,
387: mimetype, recordId, namedFields, fileMetadata,
388: fileMetadataLength);
389: }
390:
391: public void writeRequestRecord(final String url,
392: final String create14DigitDate, final String mimetype,
393: final URI recordId, final ANVLRecord namedFields,
394: final InputStream request, final long requestLength)
395: throws IOException {
396: writeRecord(REQUEST, url, create14DigitDate, mimetype,
397: recordId, namedFields, request, requestLength);
398: }
399:
400: public void writeResourceRecord(final String url,
401: final String create14DigitDate, final String mimetype,
402: final ANVLRecord namedFields, final InputStream response,
403: final long responseLength) throws IOException {
404: writeResourceRecord(url, create14DigitDate, mimetype,
405: getRecordID(), namedFields, response, responseLength);
406: }
407:
408: public void writeResourceRecord(final String url,
409: final String create14DigitDate, final String mimetype,
410: final URI recordId, final ANVLRecord namedFields,
411: final InputStream response, final long responseLength)
412: throws IOException {
413: writeRecord(RESOURCE, url, create14DigitDate, mimetype,
414: recordId, namedFields, response, responseLength);
415: }
416:
417: public void writeResponseRecord(final String url,
418: final String create14DigitDate, final String mimetype,
419: final URI recordId, final ANVLRecord namedFields,
420: final InputStream response, final long responseLength)
421: throws IOException {
422: writeRecord(RESPONSE, url, create14DigitDate, mimetype,
423: recordId, namedFields, response, responseLength);
424: }
425:
426: public void writeMetadataRecord(final String url,
427: final String create14DigitDate, final String mimetype,
428: final URI recordId, final ANVLRecord namedFields,
429: final InputStream metadata, final long metadataLength)
430: throws IOException {
431: writeRecord(METADATA, url, create14DigitDate, mimetype,
432: recordId, namedFields, metadata, metadataLength);
433: }
434:
435: /**
436: * Convenience method for getting Record-Ids.
437: * @return A record ID.
438: * @throws IOException
439: */
440: public static URI getRecordID() throws IOException {
441: URI result;
442: try {
443: result = GeneratorFactory.getFactory().getRecordID();
444: } catch (URISyntaxException e) {
445: throw new IOException(e.toString());
446: }
447: return result;
448: }
449: }
|