001: /* $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $
002: *
003: * Created on July 27th, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc;
024:
025: import java.io.ByteArrayInputStream;
026: import java.io.ByteArrayOutputStream;
027: import java.io.File;
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.io.OutputStream;
031: import java.net.URI;
032: import java.net.URISyntaxException;
033: import java.util.Iterator;
034: import java.util.List;
035: import java.util.Map;
036: import java.util.concurrent.atomic.AtomicInteger;
037:
038: import org.archive.io.UTF8Bytes;
039: import org.archive.io.WriterPoolMember;
040: import org.archive.uid.GeneratorFactory;
041: import org.archive.util.ArchiveUtils;
042: import org.archive.util.anvl.ANVLRecord;
043:
044: /**
045: * <b>Experimental</b> WARC implementation.
046: *
047: * <p>Assumption is that the caller is managing access to this
048: * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
049: * at any one time.
050: *
051: * <p>While being written, WARCs have a '.open' suffix appended.
052: *
053: * @author stack
054: * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
055: */
056: public class ExperimentalWARCWriter extends WriterPoolMember implements
057: WARCConstants {
058: /**
059: * Buffer to reuse writing streams.
060: */
061: private final byte[] readbuffer = new byte[16 * 1024];
062:
063: /**
064: * NEWLINE as bytes.
065: */
066: public static byte[] CRLF_BYTES;
067: static {
068: try {
069: CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
070: } catch (Exception e) {
071: e.printStackTrace();
072: }
073: };
074:
075: /**
076: * Metadata.
077: * TODO: Exploit writing warcinfo record. Currently unused.
078: */
079: private final List fileMetadata;
080:
081: /**
082: * Shutdown Constructor
083: * Has default access so can make instance to test utility methods.
084: */
085: ExperimentalWARCWriter() {
086: this (null, null, "", "", true, -1, null);
087: }
088:
089: /**
090: * Constructor.
091: * Takes a stream. Use with caution. There is no upperbound check on size.
092: * Will just keep writing. Only pass Streams that are bounded.
093: * @param serialNo used to generate unique file name sequences
094: * @param out Where to write.
095: * @param f File the <code>out</code> is connected to.
096: * @param cmprs Compress the content written.
097: * @param a14DigitDate If null, we'll write current time.
098: * @throws IOException
099: */
100: public ExperimentalWARCWriter(final AtomicInteger serialNo,
101: final OutputStream out, final File f, final boolean cmprs,
102: final String a14DigitDate, final List warcinfoData)
103: throws IOException {
104: super (serialNo, out, f, cmprs, a14DigitDate);
105: this .fileMetadata = warcinfoData;
106: }
107:
108: /**
109: * Constructor.
110: *
111: * @param dirs Where to drop files.
112: * @param prefix File prefix to use.
113: * @param cmprs Compress the records written.
114: * @param maxSize Maximum size for ARC files written.
115: * @param suffix File tail to use. If null, unused.
116: * @param warcinfoData File metadata for warcinfo record.
117: */
118: public ExperimentalWARCWriter(final AtomicInteger serialNo,
119: final List<File> dirs, final String prefix,
120: final String suffix, final boolean cmprs,
121: final long maxSize, final List warcinfoData) {
122: super (serialNo, dirs, prefix, suffix, cmprs, maxSize,
123: WARC_FILE_EXTENSION);
124: this .fileMetadata = warcinfoData;
125: }
126:
127: @Override
128: protected String createFile(File file) throws IOException {
129: String filename = super .createFile(file);
130: writeWarcinfoRecord(filename);
131: return filename;
132: }
133:
134: protected void baseCharacterCheck(final char c,
135: final String parameter) throws IOException {
136: // TODO: Too strict? UNICODE control characters?
137: if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
138: throw new IOException("Contains illegal character 0x"
139: + Integer.toHexString(c) + ": " + parameter);
140: }
141: }
142:
143: protected String checkHeaderValue(final String value)
144: throws IOException {
145: for (int i = 0; i < value.length(); i++) {
146: final char c = value.charAt(i);
147: baseCharacterCheck(c, value);
148: if (Character.isWhitespace(c)) {
149: throw new IOException(
150: "Contains disallowed white space 0x"
151: + Integer.toHexString(c) + ": " + value);
152: }
153: }
154: return value;
155: }
156:
157: protected String checkHeaderLineMimetypeParameter(
158: final String parameter) throws IOException {
159: StringBuilder sb = new StringBuilder(parameter.length());
160: boolean wasWhitespace = false;
161: for (int i = 0; i < parameter.length(); i++) {
162: char c = parameter.charAt(i);
163: if (Character.isWhitespace(c)) {
164: // Map all to ' ' and collapse multiples into one.
165: // TODO: Make sure white space occurs in legal location --
166: // before parameter or inside quoted-string.
167: if (wasWhitespace) {
168: continue;
169: }
170: wasWhitespace = true;
171: c = ' ';
172: } else {
173: wasWhitespace = false;
174: baseCharacterCheck(c, parameter);
175: }
176: sb.append(c);
177: }
178:
179: return sb.toString();
180: }
181:
182: protected String createRecordHeader(final String type,
183: final String url, final String create14DigitDate,
184: final String mimetype, final URI recordId,
185: final ANVLRecord xtraHeaders, final long contentLength)
186: throws IOException {
187: final StringBuilder sb = new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
188: sb.append(WARC_ID).append(CRLF);
189: sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type)
190: .append(CRLF);
191: // Do not write a subject-uri if not one present.
192: if (url != null && url.length() > 0) {
193: sb.append(HEADER_KEY_URI).append(COLON_SPACE).append(
194: checkHeaderValue(url)).append(CRLF);
195: }
196: sb.append(HEADER_KEY_DATE).append(COLON_SPACE).append(
197: create14DigitDate).append(CRLF);
198: if (xtraHeaders != null) {
199: for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) {
200: sb.append(i.next()).append(CRLF);
201: }
202: }
203:
204: // TODO: Is MIME Version needed.
205: sb.append(MIME_VERSION).append(CRLF);
206: sb.append(CONTENT_ID).append(COLON_SPACE).append('<').append(
207: recordId.toString()).append('>').append(CRLF);
208: if (contentLength > 0) {
209: sb.append(CONTENT_TYPE).append(COLON_SPACE).append(
210: checkHeaderLineMimetypeParameter(mimetype)).append(
211: CRLF);
212: }
213: sb.append(CONTENT_LENGTH).append(COLON_SPACE).append(
214: Long.toString(contentLength)).append(CRLF);
215:
216: return sb.toString();
217: }
218:
219: protected void writeRecord(final String type, final String url,
220: final String create14DigitDate, final String mimetype,
221: final URI recordId, ANVLRecord xtraHeaders,
222: final InputStream contentStream, final long contentLength)
223: throws IOException {
224: if (!TYPES_LIST.contains(type)) {
225: throw new IllegalArgumentException("Unknown record type: "
226: + type);
227: }
228: if (contentLength == 0
229: && (xtraHeaders == null || xtraHeaders.size() <= 0)) {
230: throw new IllegalArgumentException("Cannot write record "
231: + "of content-length zero and base headers only.");
232: }
233:
234: preWriteRecordTasks();
235: try {
236: final String header = createRecordHeader(type, url,
237: create14DigitDate, mimetype, recordId, xtraHeaders,
238: contentLength);
239: // TODO: Revisit endcoding of header.
240: write(header.getBytes(WARC_HEADER_ENCODING));
241:
242: if (contentStream != null && contentLength > 0) {
243: // Write out the header/body separator.
244: write(CRLF_BYTES); // TODO: should this be written even for zero-length?
245: readToLimitFrom(contentStream, contentLength,
246: this .readbuffer);
247: }
248:
249: // Write out the two blank lines at end of all records.
250: // TODO: Why? Messes up skipping through file. Also not in grammar.
251: write(CRLF_BYTES);
252: write(CRLF_BYTES);
253: } finally {
254: postWriteRecordTasks();
255: }
256: }
257:
258: protected URI generateRecordId(final Map<String, String> qualifiers)
259: throws IOException {
260: URI rid = null;
261: try {
262: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
263: qualifiers);
264: } catch (URISyntaxException e) {
265: // Convert to IOE so can let it out.
266: throw new IOException(e.getMessage());
267: }
268: return rid;
269: }
270:
271: protected URI generateRecordId(final String key, final String value)
272: throws IOException {
273: URI rid = null;
274: try {
275: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
276: key, value);
277: } catch (URISyntaxException e) {
278: // Convert to IOE so can let it out.
279: throw new IOException(e.getMessage());
280: }
281: return rid;
282: }
283:
284: public URI writeWarcinfoRecord(String filename) throws IOException {
285: return writeWarcinfoRecord(filename, null);
286: }
287:
288: public URI writeWarcinfoRecord(String filename,
289: final String description) throws IOException {
290: // Strip .open suffix if present.
291: if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
292: filename = filename.substring(0, filename.length()
293: - WriterPoolMember.OCCUPIED_SUFFIX.length());
294: }
295: ANVLRecord record = new ANVLRecord(2);
296: record.addLabelValue(HEADER_KEY_FILENAME, filename);
297: if (description != null && description.length() > 0) {
298: record.addLabelValue(CONTENT_DESCRIPTION, description);
299: }
300: // Add warcinfo body.
301: byte[] warcinfoBody = null;
302: if (this .fileMetadata == null) {
303: // TODO: What to write into a warcinfo? What to associate?
304: warcinfoBody = "TODO: Unimplemented".getBytes();
305: } else {
306: ByteArrayOutputStream baos = new ByteArrayOutputStream();
307: for (final Iterator i = this .fileMetadata.iterator(); i
308: .hasNext();) {
309: baos
310: .write(i.next().toString().getBytes(
311: UTF8Bytes.UTF8));
312: }
313: warcinfoBody = baos.toByteArray();
314: }
315: URI uri = writeWarcinfoRecord("text/xml", record,
316: new ByteArrayInputStream(warcinfoBody),
317: warcinfoBody.length);
318: // TODO: If at start of file, and we're writing compressed,
319: // write out our distinctive GZIP extensions.
320: return uri;
321: }
322:
323: /**
324: * Write a warcinfo to current file.
325: * TODO: Write crawl metadata or pointers to crawl description.
326: * @param mimetype Mimetype of the <code>fileMetadata</code> block.
327: * @param namedFields Named fields. Pass <code>null</code> if none.
328: * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
329: * @param fileMetadataLength Length of <code>fileMetadata</code>.
330: * @throws IOException
331: * @return Generated record-id made with
332: * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
333: * the current filename.
334: */
335: public URI writeWarcinfoRecord(final String mimetype,
336: final ANVLRecord namedFields,
337: final InputStream fileMetadata,
338: final long fileMetadataLength) throws IOException {
339: final URI recordid = generateRecordId(TYPE, WARCINFO);
340: writeWarcinfoRecord(ArchiveUtils.getLog14Date(), mimetype,
341: recordid, namedFields, fileMetadata, fileMetadataLength);
342: return recordid;
343: }
344:
345: /**
346: * Write a <code>warcinfo</code> to current file.
347: * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
348: * @param recordId URI to use for this warcinfo.
349: * @param create14DigitDate Record creation date as 14 digit date.
350: * @param mimetype Mimetype of the <code>fileMetadata</code>.
351: * @param namedFields Named fields.
352: * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
353: * @param fileMetadataLength Length of <code>fileMetadata</code>.
354: * @throws IOException
355: */
356: public void writeWarcinfoRecord(final String create14DigitDate,
357: final String mimetype, final URI recordId,
358: final ANVLRecord namedFields,
359: final InputStream fileMetadata,
360: final long fileMetadataLength) throws IOException {
361: writeRecord(WARCINFO, null, create14DigitDate, mimetype,
362: recordId, namedFields, fileMetadata, fileMetadataLength);
363: }
364:
365: public void writeRequestRecord(final String url,
366: final String create14DigitDate, final String mimetype,
367: final URI recordId, final ANVLRecord namedFields,
368: final InputStream request, final long requestLength)
369: throws IOException {
370: writeRecord(REQUEST, url, create14DigitDate, mimetype,
371: recordId, namedFields, request, requestLength);
372: }
373:
374: public void writeResourceRecord(final String url,
375: final String create14DigitDate, final String mimetype,
376: final ANVLRecord namedFields, final InputStream response,
377: final long responseLength) throws IOException {
378: writeResourceRecord(url, create14DigitDate, mimetype,
379: getRecordID(), namedFields, response, responseLength);
380: }
381:
382: public void writeResourceRecord(final String url,
383: final String create14DigitDate, final String mimetype,
384: final URI recordId, final ANVLRecord namedFields,
385: final InputStream response, final long responseLength)
386: throws IOException {
387: writeRecord(RESOURCE, url, create14DigitDate, mimetype,
388: recordId, namedFields, response, responseLength);
389: }
390:
391: public void writeResponseRecord(final String url,
392: final String create14DigitDate, final String mimetype,
393: final URI recordId, final ANVLRecord namedFields,
394: final InputStream response, final long responseLength)
395: throws IOException {
396: writeRecord(RESPONSE, url, create14DigitDate, mimetype,
397: recordId, namedFields, response, responseLength);
398: }
399:
400: public void writeRevisitRecord(final String url,
401: final String create14DigitDate, final String mimetype,
402: final URI recordId, final ANVLRecord namedFields,
403: final InputStream response, final long responseLength)
404: throws IOException {
405: writeRecord(REVISIT, url, create14DigitDate, mimetype,
406: recordId, namedFields, response, responseLength);
407: }
408:
409: public void writeMetadataRecord(final String url,
410: final String create14DigitDate, final String mimetype,
411: final URI recordId, final ANVLRecord namedFields,
412: final InputStream metadata, final long metadataLength)
413: throws IOException {
414: writeRecord(METADATA, url, create14DigitDate, mimetype,
415: recordId, namedFields, metadata, metadataLength);
416: }
417:
418: /**
419: * Convenience method for getting Record-Ids.
420: * @return A record ID.
421: * @throws IOException
422: */
423: public static URI getRecordID() throws IOException {
424: URI result;
425: try {
426: result = GeneratorFactory.getFactory().getRecordID();
427: } catch (URISyntaxException e) {
428: throw new IOException(e.toString());
429: }
430: return result;
431: }
432: }
|