001: /*
002: * ExperimentalWARCWriterTest
003: *
004: * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $
005: *
006: * Created on July 27th, 2006
007: *
008: * Copyright (C) 2006 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.io.warc;
027:
028: import java.io.ByteArrayInputStream;
029: import java.io.ByteArrayOutputStream;
030: import java.io.File;
031: import java.io.FileNotFoundException;
032: import java.io.IOException;
033: import java.net.URI;
034: import java.net.URISyntaxException;
035: import java.util.Arrays;
036: import java.util.Iterator;
037: import java.util.List;
038: import java.util.concurrent.atomic.AtomicInteger;
039:
040: import org.archive.io.ArchiveRecord;
041: import org.archive.io.ArchiveRecordHeader;
042: import org.archive.io.UTF8Bytes;
043: import org.archive.io.WriterPoolMember;
044: import org.archive.io.warc.WARCConstants;
045: import org.archive.uid.GeneratorFactory;
046: import org.archive.util.ArchiveUtils;
047: import org.archive.util.TmpDirTestCase;
048: import org.archive.util.anvl.ANVLRecord;
049:
050: /**
051: * Test Writer and Reader.
052: * @author stack
053: * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
054: */
055: public class ExperimentalWARCWriterTest extends TmpDirTestCase
056: implements WARCConstants {
057: private static final AtomicInteger SERIAL_NO = new AtomicInteger();
058:
059: /**
060: * Prefix to use for ARC files made by JUNIT.
061: */
062: private static final String PREFIX = "IAH";
063:
064: private static final String SOME_URL = "http://www.archive.org/test/";
065:
066: public void testCheckHeaderLineValue() throws Exception {
067: ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
068: writer.checkHeaderValue("one");
069: IOException exception = null;
070: try {
071: writer.checkHeaderValue("with space");
072: } catch (IOException e) {
073: exception = e;
074: }
075: assertNotNull(exception);
076: exception = null;
077: try {
078: writer.checkHeaderValue("with\0x0000controlcharacter");
079: } catch (IOException e) {
080: exception = e;
081: }
082: assertNotNull(exception);
083: }
084:
085: public void testMimetypes() throws IOException {
086: ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
087: writer.checkHeaderLineMimetypeParameter("text/xml");
088: writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
089: assertEquals(
090: writer
091: .checkHeaderLineMimetypeParameter("text/plain; charset=SHIFT-JIS"),
092: "text/plain; charset=SHIFT-JIS");
093: assertEquals(
094: writer
095: .checkHeaderLineMimetypeParameter("multipart/mixed; \r\n boundary=\"simple boundary\""),
096: "multipart/mixed; boundary=\"simple boundary\"");
097: }
098:
099: public void testWriteRecord() throws IOException {
100: File[] files = { getTmpDir() };
101:
102: // Write uncompressed.
103: ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
104: SERIAL_NO, Arrays.asList(files), this .getClass()
105: .getName(), "suffix", false, -1, null);
106: writeFile(writer);
107:
108: // Write compressed.
109: writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays
110: .asList(files), this .getClass().getName(), "suffix",
111: true, -1, null);
112: writeFile(writer);
113: }
114:
115: private void writeFile(final ExperimentalWARCWriter writer)
116: throws IOException {
117: try {
118: writeWarcinfoRecord(writer);
119: writeBasicRecords(writer);
120: } finally {
121: writer.close();
122: writer.getFile().delete();
123: }
124: }
125:
126: private void writeWarcinfoRecord(ExperimentalWARCWriter writer)
127: throws IOException {
128: ANVLRecord meta = new ANVLRecord();
129: meta.addLabelValue("size", "1G");
130: meta.addLabelValue("operator", "igor");
131: byte[] bytes = meta.getUTF8Bytes();
132: writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
133: new ByteArrayInputStream(bytes), bytes.length);
134: }
135:
136: protected void writeBasicRecords(final ExperimentalWARCWriter writer)
137: throws IOException {
138: ANVLRecord headerFields = new ANVLRecord();
139: headerFields.addLabelValue("x", "y");
140: headerFields.addLabelValue("a", "b");
141:
142: URI rid = null;
143: try {
144: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
145: TYPE, METADATA);
146: } catch (URISyntaxException e) {
147: // Convert to IOE so can let it out.
148: throw new IOException(e.getMessage());
149: }
150: final String content = "Any old content.";
151: for (int i = 0; i < 10; i++) {
152: String body = i + ". " + content;
153: byte[] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
154: writer.writeRecord(METADATA, "http://www.archive.org/",
155: ArchiveUtils.get14DigitDate(), "no/type", rid,
156: headerFields, new ByteArrayInputStream(bodyBytes),
157: (long) bodyBytes.length);
158: }
159: }
160:
161: /**
162: * @return Generic HTML Content.
163: */
164: protected static String getContent() {
165: return getContent(null);
166: }
167:
168: /**
169: * @return Generic HTML Content with mention of passed <code>indexStr</code>
170: * in title and body.
171: */
172: protected static String getContent(String indexStr) {
173: String page = (indexStr != null) ? "Page #" + indexStr
174: : "Some Page";
175: return "HTTP/1.1 200 OK\r\n"
176: + "Content-Type: text/html\r\n\r\n"
177: + "<html><head><title>" + page + "</title></head>"
178: + "<body>" + page + "</body></html>";
179: }
180:
181: /**
182: * Write random HTML Record.
183: * @param w Where to write.
184: * @param index An index to put into content.
185: * @return Length of record written.
186: * @throws IOException
187: */
188: protected int writeRandomHTTPRecord(ExperimentalWARCWriter w,
189: int index) throws IOException {
190: ByteArrayOutputStream baos = new ByteArrayOutputStream();
191: String indexStr = Integer.toString(index);
192: byte[] record = (getContent(indexStr)).getBytes();
193: int recordLength = record.length;
194: baos.write(record);
195: // Add named fields for ip, checksum, and relate the metadata
196: // and request to the resource field.
197: ANVLRecord r = new ANVLRecord(1);
198: r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
199: w.writeResourceRecord("http://www.one.net/id=" + indexStr,
200: ArchiveUtils.get14DigitDate(),
201: "text/html; charset=UTF-8", r,
202: new ByteArrayInputStream(baos.toByteArray()),
203: recordLength);
204: return recordLength;
205: }
206:
207: /**
208: * Fill a WARC with HTML Records.
209: * @param baseName WARC basename.
210: * @param compress Whether to compress or not.
211: * @param maxSize Maximum WARC size.
212: * @param recordCount How many records.
213: * @return The written file.
214: * @throws IOException
215: */
216: private File writeRecords(String baseName, boolean compress,
217: int maxSize, int recordCount) throws IOException {
218: cleanUpOldFiles(baseName);
219: File[] files = { getTmpDir() };
220: ExperimentalWARCWriter w = new ExperimentalWARCWriter(
221: SERIAL_NO, Arrays.asList(files), baseName + '-'
222: + PREFIX, "", compress, maxSize, null);
223: assertNotNull(w);
224: for (int i = 0; i < recordCount; i++) {
225: writeRandomHTTPRecord(w, i);
226: }
227: w.close();
228: assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), w
229: .getFile().exists());
230: return w.getFile();
231: }
232:
233: /**
234: * Run validation of passed file.
235: * @param f File to validate.
236: * @param recordCount Expected count of records.
237: * @throws FileNotFoundException
238: * @throws IOException
239: */
240: private void validate(File f, int recordCount)
241: throws FileNotFoundException, IOException {
242: WARCReader reader = WARCReaderFactory.get(f);
243: assertNotNull(reader);
244: List headers = null;
245: if (recordCount == -1) {
246: headers = reader.validate();
247: } else {
248: headers = reader.validate(recordCount);
249: }
250: reader.close();
251:
252: // Now, run through each of the records doing absolute get going from
253: // the end to start. Reopen the arc so no context between this test
254: // and the previous.
255: reader = WARCReaderFactory.get(f);
256: for (int i = headers.size() - 1; i >= 0; i--) {
257: ArchiveRecordHeader h = (ArchiveRecordHeader) headers
258: .get(i);
259: ArchiveRecord r = reader.get(h.getOffset());
260: String mimeType = r.getHeader().getMimetype();
261: assertTrue("Record is bogus", mimeType != null
262: && mimeType.length() > 0);
263: }
264: reader.close();
265:
266: assertTrue("Metadatas not equal", headers.size() == recordCount);
267: for (Iterator i = headers.iterator(); i.hasNext();) {
268: ArchiveRecordHeader r = (ArchiveRecordHeader) i.next();
269: assertTrue("Record is empty", r.getLength() > 0);
270: }
271: }
272:
273: public void testWriteRecords() throws IOException {
274: final int recordCount = 2;
275: File f = writeRecords("writeRecord", false,
276: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
277: validate(f, recordCount + 1); // Header record.
278: }
279:
280: public void testRandomAccess() throws IOException {
281: final int recordCount = 3;
282: File f = writeRecords("writeRecord", true,
283: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
284: WARCReader reader = WARCReaderFactory.get(f);
285: // Get to second record. Get its offset for later use.
286: boolean readFirst = false;
287: String url = null;
288: long offset = -1;
289: long totalRecords = 0;
290: boolean readSecond = false;
291: for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
292: WARCRecord ar = (WARCRecord) i.next();
293: if (!readFirst) {
294: readFirst = true;
295: continue;
296: }
297: if (!readSecond) {
298: url = ar.getHeader().getUrl();
299: offset = ar.getHeader().getOffset();
300: readSecond = true;
301: }
302: }
303:
304: reader = WARCReaderFactory.get(f, offset);
305: ArchiveRecord ar = reader.get();
306: assertEquals(ar.getHeader().getUrl(), url);
307: ar.close();
308:
309: // Get reader again. See how iterator works with offset
310: reader = WARCReaderFactory.get(f, offset);
311: int count = 0;
312: for (final Iterator i = reader.iterator(); i.hasNext(); i
313: .next()) {
314: count++;
315: }
316: reader.close();
317: assertEquals(totalRecords - 1, count);
318: }
319:
320: public void testWriteRecordCompressed() throws IOException {
321: final int recordCount = 2;
322: File arcFile = writeRecords("writeRecordCompressed", true,
323: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
324: validate(arcFile, recordCount + 1 /*Header record*/);
325: }
326:
327: protected ExperimentalWARCWriter createWARCWriter(String NAME,
328: boolean compress) {
329: File[] files = { getTmpDir() };
330: return new ExperimentalWARCWriter(SERIAL_NO, Arrays
331: .asList(files), NAME, "", compress,
332: DEFAULT_MAX_WARC_FILE_SIZE, null);
333: }
334:
335: protected static ByteArrayOutputStream getBaos(String str)
336: throws IOException {
337: ByteArrayOutputStream baos = new ByteArrayOutputStream();
338: baos.write(str.getBytes());
339: return baos;
340: }
341:
342: protected static void writeRecord(ExperimentalWARCWriter w,
343: String url, String mimetype, int len,
344: ByteArrayOutputStream baos) throws IOException {
345: w.writeResourceRecord(url, ArchiveUtils.get14DigitDate(),
346: mimetype, null, new ByteArrayInputStream(baos
347: .toByteArray()), len);
348: }
349:
350: protected int iterateRecords(WARCReader r) throws IOException {
351: int count = 0;
352: for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
353: ArchiveRecord ar = i.next();
354: ar.close();
355: if (count != 0) {
356: assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
357: ar.getHeader().getUrl().equals(SOME_URL));
358: }
359: count++;
360: }
361: return count;
362: }
363:
364: protected ExperimentalWARCWriter createWithOneRecord(String name,
365: boolean compressed) throws IOException {
366: ExperimentalWARCWriter writer = createWARCWriter(name,
367: compressed);
368: String content = getContent();
369: writeRecord(writer, SOME_URL, "text/html", content.length(),
370: getBaos(content));
371: return writer;
372: }
373:
374: public void testSpaceInURL() {
375: String eMessage = null;
376: try {
377: holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
378: } catch (IOException e) {
379: eMessage = e.getMessage();
380: }
381: assertTrue("Didn't get expected exception: " + eMessage,
382: eMessage.startsWith("Contains disallowed"));
383: }
384:
385: public void testTabInURL() {
386: String eMessage = null;
387: try {
388: holeyUrl("testTabInURL-" + PREFIX, false, "\t");
389: } catch (IOException e) {
390: eMessage = e.getMessage();
391: }
392: assertTrue("Didn't get expected exception: " + eMessage,
393: eMessage.startsWith("Contains illegal"));
394: }
395:
396: protected void holeyUrl(String name, boolean compress,
397: String urlInsert) throws IOException {
398: ExperimentalWARCWriter writer = createWithOneRecord(name,
399: compress);
400: // Add some bytes on the end to mess up the record.
401: String content = getContent();
402: ByteArrayOutputStream baos = getBaos(content);
403: writeRecord(writer, SOME_URL + urlInsert + "/index.html",
404: "text/html", content.length(), baos);
405: writer.close();
406: }
407:
408: /**
409: * Write an arc file for other tests to use.
410: * @param arcdir Directory to write to.
411: * @param compress True if file should be compressed.
412: * @return ARC written.
413: * @throws IOException
414: */
415: public static File createWARCFile(File arcdir, boolean compress)
416: throws IOException {
417: File[] files = { arcdir };
418: ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
419: SERIAL_NO, Arrays.asList(files), "test", "", compress,
420: DEFAULT_MAX_WARC_FILE_SIZE, null);
421: String content = getContent();
422: writeRecord(writer, SOME_URL, "text/html", content.length(),
423: getBaos(content));
424: writer.close();
425: return writer.getFile();
426: }
427:
428: // public void testSpeed() throws IOException {
429: // ARCWriter writer = createArcWithOneRecord("speed", true);
430: // // Add a record with a length that is too long.
431: // String content = getContent();
432: // final int count = 100000;
433: // logger.info("Starting speed write of " + count + " records.");
434: // for (int i = 0; i < count; i++) {
435: // writeRecord(writer, SOME_URL, "text/html", content.length(),
436: // getBaos(content));
437: // }
438: // writer.close();
439: // logger.info("Finished speed write test.");
440: // }
441:
442: public void testArcRecordOffsetReads() throws Exception {
443: // Get an ARC with one record.
444: WriterPoolMember w = createWithOneRecord(
445: "testArcRecordInBufferStream", true);
446: w.close();
447: // Get reader on said ARC.
448: WARCReader r = WARCReaderFactory.get(w.getFile());
449: final Iterator<ArchiveRecord> i = r.iterator();
450: // Skip first ARC meta record.
451: ArchiveRecord ar = i.next();
452: i.hasNext();
453: // Now we're at first and only record in ARC.
454: ar = (WARCRecord) i.next();
455: // Now try getting some random set of bytes out of it
456: // at an odd offset (used to fail because we were
457: // doing bad math to find where in buffer to read).
458: final byte[] buffer = new byte[17];
459: final int maxRead = 4;
460: int totalRead = 0;
461: while (totalRead < maxRead) {
462: totalRead = totalRead
463: + ar.read(buffer, 13 + totalRead, maxRead
464: - totalRead);
465: assertTrue(totalRead > 0);
466: }
467: }
468: }
|