001: /*
002: * ExperimentalWARCWriterTest
003: *
004: * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $
005: *
006: * Created on July 27th, 2006
007: *
008: * Copyright (C) 2006 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026: package org.archive.io.warc.v10;
027:
028: import java.io.ByteArrayInputStream;
029: import java.io.ByteArrayOutputStream;
030: import java.io.File;
031: import java.io.FileNotFoundException;
032: import java.io.IOException;
033: import java.net.URI;
034: import java.net.URISyntaxException;
035: import java.util.Arrays;
036: import java.util.Iterator;
037: import java.util.List;
038: import java.util.concurrent.atomic.AtomicInteger;
039:
040: import org.archive.io.ArchiveRecord;
041: import org.archive.io.ArchiveRecordHeader;
042: import org.archive.io.UTF8Bytes;
043: import org.archive.io.WriterPoolMember;
044: import org.archive.io.warc.WARCConstants;
045: import org.archive.uid.GeneratorFactory;
046: import org.archive.util.ArchiveUtils;
047: import org.archive.util.TmpDirTestCase;
048: import org.archive.util.anvl.ANVLRecord;
049:
050: /**
051: * Test Writer and Reader.
052: * @author stack
053: * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
054: */
055: public class ExperimentalWARCWriterTest extends TmpDirTestCase
056: implements WARCConstants {
057: private static final AtomicInteger SERIAL_NO = new AtomicInteger();
058:
059: /**
060: * Prefix to use for WARC files made by JUnit.
061: */
062: private static final String PREFIX = "IAH";
063:
064: private static final String SOME_URL = "http://www.archive.org/test/";
065:
066: public void testCheckHeaderLineValue() throws Exception {
067: ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
068: writer.checkHeaderLineParameters("one");
069: IOException exception = null;
070: try {
071: writer.checkHeaderLineParameters("with space");
072: } catch (IOException e) {
073: exception = e;
074: }
075: assertNotNull(exception);
076: exception = null;
077: try {
078: writer
079: .checkHeaderLineParameters("with\0x0000controlcharacter");
080: } catch (IOException e) {
081: exception = e;
082: }
083: assertNotNull(exception);
084: }
085:
086: public void testMimetypes() throws IOException {
087: ExperimentalWARCWriter writer = new ExperimentalWARCWriter();
088: writer.checkHeaderLineMimetypeParameter("text/xml");
089: writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
090: writer
091: .checkHeaderLineMimetypeParameter("text/plain; charset=SHIFT-JIS");
092: System.out
093: .println(writer
094: .checkHeaderLineMimetypeParameter("multipart/mixed; \r\n boundary=\"simple boundary\""));
095: }
096:
097: public void testWriteRecord() throws IOException {
098: File[] files = { getTmpDir() };
099:
100: // Write uncompressed.
101: ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
102: SERIAL_NO, Arrays.asList(files), this .getClass()
103: .getName(), "suffix", false, -1, null);
104: writeFile(writer);
105:
106: // Write compressed.
107: writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays
108: .asList(files), this .getClass().getName(), "suffix",
109: true, -1, null);
110: writeFile(writer);
111: }
112:
113: private void writeFile(final ExperimentalWARCWriter writer)
114: throws IOException {
115: try {
116: writeWarcinfoRecord(writer);
117: writeBasicRecords(writer);
118: } finally {
119: writer.close();
120: writer.getFile().delete();
121: }
122: }
123:
124: private void writeWarcinfoRecord(ExperimentalWARCWriter writer)
125: throws IOException {
126: ANVLRecord meta = new ANVLRecord();
127: meta.addLabelValue("size", "1G");
128: meta.addLabelValue("operator", "igor");
129: byte[] bytes = meta.getUTF8Bytes();
130: writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
131: new ByteArrayInputStream(bytes), bytes.length);
132: }
133:
134: protected void writeBasicRecords(final ExperimentalWARCWriter writer)
135: throws IOException {
136: ANVLRecord headerFields = new ANVLRecord();
137: headerFields.addLabelValue("x", "y");
138: headerFields.addLabelValue("a", "b");
139:
140: URI rid = null;
141: try {
142: rid = GeneratorFactory.getFactory().getQualifiedRecordID(
143: TYPE, METADATA);
144: } catch (URISyntaxException e) {
145: // Convert to IOE so can let it out.
146: throw new IOException(e.getMessage());
147: }
148: final String content = "Any old content.";
149: for (int i = 0; i < 10; i++) {
150: String body = i + ". " + content;
151: byte[] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
152: writer.writeRecord(METADATA, "http://www.archive.org/",
153: ArchiveUtils.get14DigitDate(), "no/type", rid,
154: headerFields, new ByteArrayInputStream(bodyBytes),
155: (long) bodyBytes.length);
156: }
157: }
158:
159: /**
160: * @return Generic HTML Content.
161: */
162: protected static String getContent() {
163: return getContent(null);
164: }
165:
166: /**
167: * @return Generic HTML Content with mention of passed <code>indexStr</code>
168: * in title and body.
169: */
170: protected static String getContent(String indexStr) {
171: String page = (indexStr != null) ? "Page #" + indexStr
172: : "Some Page";
173: return "HTTP/1.1 200 OK\r\n"
174: + "Content-Type: text/html\r\n\r\n"
175: + "<html><head><title>" + page + "</title></head>"
176: + "<body>" + page + "</body></html>";
177: }
178:
179: /**
180: * Write random HTML Record.
181: * @param w Where to write.
182: * @param index An index to put into content.
183: * @return Length of record written.
184: * @throws IOException
185: */
186: protected int writeRandomHTTPRecord(ExperimentalWARCWriter w,
187: int index) throws IOException {
188: ByteArrayOutputStream baos = new ByteArrayOutputStream();
189: String indexStr = Integer.toString(index);
190: byte[] record = (getContent(indexStr)).getBytes();
191: int recordLength = record.length;
192: baos.write(record);
193: // Add named fields for ip, checksum, and relate the metadata
194: // and request to the resource field.
195: ANVLRecord r = new ANVLRecord(1);
196: r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
197: w.writeResourceRecord("http://www.one.net/id=" + indexStr,
198: ArchiveUtils.get14DigitDate(),
199: "text/html; charset=UTF-8", r,
200: new ByteArrayInputStream(baos.toByteArray()),
201: recordLength);
202: return recordLength;
203: }
204:
205: /**
206: * Fill a WARC with HTML Records.
207: * @param baseName WARC basename.
208: * @param compress Whether to compress or not.
209: * @param maxSize Maximum WARC size.
210: * @param recordCount How many records.
211: * @return The written file.
212: * @throws IOException
213: */
214: private File writeRecords(String baseName, boolean compress,
215: int maxSize, int recordCount) throws IOException {
216: cleanUpOldFiles(baseName);
217: File[] files = { getTmpDir() };
218: ExperimentalWARCWriter w = new ExperimentalWARCWriter(
219: SERIAL_NO, Arrays.asList(files), baseName + '-'
220: + PREFIX, "", compress, maxSize, null);
221: assertNotNull(w);
222: for (int i = 0; i < recordCount; i++) {
223: writeRandomHTTPRecord(w, i);
224: }
225: w.close();
226: assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), w
227: .getFile().exists());
228: return w.getFile();
229: }
230:
231: /**
232: * Run validation of passed file.
233: * @param f File to validate.
234: * @param recordCount Expected count of records.
235: * @throws FileNotFoundException
236: * @throws IOException
237: */
238: private void validate(File f, int recordCount)
239: throws FileNotFoundException, IOException {
240: WARCReader reader = WARCReaderFactory.get(f);
241: assertNotNull(reader);
242: List headers = null;
243: if (recordCount == -1) {
244: headers = reader.validate();
245: } else {
246: headers = reader.validate(recordCount);
247: }
248: reader.close();
249:
250: // Now, run through each of the records doing absolute get going from
251: // the end to start. Reopen the arc so no context between this test
252: // and the previous.
253: reader = WARCReaderFactory.get(f);
254: for (int i = headers.size() - 1; i >= 0; i--) {
255: ArchiveRecordHeader h = (ArchiveRecordHeader) headers
256: .get(i);
257: ArchiveRecord r = reader.get(h.getOffset());
258: String mimeType = r.getHeader().getMimetype();
259: assertTrue("Record is bogus", mimeType != null
260: && mimeType.length() > 0);
261: }
262: reader.close();
263:
264: assertTrue("Metadatas not equal", headers.size() == recordCount);
265: for (Iterator i = headers.iterator(); i.hasNext();) {
266: ArchiveRecordHeader r = (ArchiveRecordHeader) i.next();
267: assertTrue("Record is empty", r.getLength() > 0);
268: }
269: }
270:
271: public void testWriteRecords() throws IOException {
272: final int recordCount = 2;
273: File f = writeRecords("writeRecord", false,
274: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
275: validate(f, recordCount + 1); // Header record.
276: }
277:
278: public void testRandomAccess() throws IOException {
279: final int recordCount = 3;
280: File f = writeRecords("writeRecord", true,
281: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
282: WARCReader reader = WARCReaderFactory.get(f);
283: // Get to second record. Get its offset for later use.
284: boolean readFirst = false;
285: String url = null;
286: long offset = -1;
287: long totalRecords = 0;
288: boolean readSecond = false;
289: for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
290: WARCRecord ar = (WARCRecord) i.next();
291: if (!readFirst) {
292: readFirst = true;
293: continue;
294: }
295: if (!readSecond) {
296: url = ar.getHeader().getUrl();
297: offset = ar.getHeader().getOffset();
298: readSecond = true;
299: }
300: }
301:
302: reader = WARCReaderFactory.get(f, offset);
303: ArchiveRecord ar = reader.get();
304: assertEquals(ar.getHeader().getUrl(), url);
305: ar.close();
306:
307: // Get reader again. See how iterator works with offset
308: reader = WARCReaderFactory.get(f, offset);
309: int count = 0;
310: for (final Iterator i = reader.iterator(); i.hasNext(); i
311: .next()) {
312: count++;
313: }
314: reader.close();
315: assertEquals(totalRecords - 1, count);
316: }
317:
318: public void testWriteRecordCompressed() throws IOException {
319: final int recordCount = 2;
320: File arcFile = writeRecords("writeRecordCompressed", true,
321: DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
322: validate(arcFile, recordCount + 1 /*Header record*/);
323: }
324:
325: protected ExperimentalWARCWriter createWARCWriter(String NAME,
326: boolean compress) {
327: File[] files = { getTmpDir() };
328: return new ExperimentalWARCWriter(SERIAL_NO, Arrays
329: .asList(files), NAME, "", compress,
330: DEFAULT_MAX_WARC_FILE_SIZE, null);
331: }
332:
333: protected static ByteArrayOutputStream getBaos(String str)
334: throws IOException {
335: ByteArrayOutputStream baos = new ByteArrayOutputStream();
336: baos.write(str.getBytes());
337: return baos;
338: }
339:
340: protected static void writeRecord(ExperimentalWARCWriter w,
341: String url, String mimetype, int len,
342: ByteArrayOutputStream baos) throws IOException {
343: w.writeResourceRecord(url, ArchiveUtils.get14DigitDate(),
344: mimetype, null, new ByteArrayInputStream(baos
345: .toByteArray()), len);
346: }
347:
348: protected int iterateRecords(WARCReader r) throws IOException {
349: int count = 0;
350: for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
351: ArchiveRecord ar = i.next();
352: ar.close();
353: if (count != 0) {
354: assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
355: ar.getHeader().getUrl().equals(SOME_URL));
356: }
357: count++;
358: }
359: return count;
360: }
361:
362: protected ExperimentalWARCWriter createWithOneRecord(String name,
363: boolean compressed) throws IOException {
364: ExperimentalWARCWriter writer = createWARCWriter(name,
365: compressed);
366: String content = getContent();
367: writeRecord(writer, SOME_URL, "text/html", content.length(),
368: getBaos(content));
369: return writer;
370: }
371:
372: public void testSpaceInURL() {
373: String eMessage = null;
374: try {
375: holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
376: } catch (IOException e) {
377: eMessage = e.getMessage();
378: }
379: assertTrue("Didn't get expected exception: " + eMessage,
380: eMessage.startsWith("Contains disallowed"));
381: }
382:
383: public void testTabInURL() {
384: String eMessage = null;
385: try {
386: holeyUrl("testTabInURL-" + PREFIX, false, "\t");
387: } catch (IOException e) {
388: eMessage = e.getMessage();
389: }
390: assertTrue("Didn't get expected exception: " + eMessage,
391: eMessage.startsWith("Contains illegal"));
392: }
393:
394: protected void holeyUrl(String name, boolean compress,
395: String urlInsert) throws IOException {
396: ExperimentalWARCWriter writer = createWithOneRecord(name,
397: compress);
398: // Add some bytes on the end to mess up the record.
399: String content = getContent();
400: ByteArrayOutputStream baos = getBaos(content);
401: writeRecord(writer, SOME_URL + urlInsert + "/index.html",
402: "text/html", content.length(), baos);
403: writer.close();
404: }
405:
406: /**
407: * Write an arc file for other tests to use.
408: * @param arcdir Directory to write to.
409: * @param compress True if file should be compressed.
410: * @return ARC written.
411: * @throws IOException
412: */
413: public static File createWARCFile(File arcdir, boolean compress)
414: throws IOException {
415: File[] files = { arcdir };
416: ExperimentalWARCWriter writer = new ExperimentalWARCWriter(
417: SERIAL_NO, Arrays.asList(files), "test", "", compress,
418: DEFAULT_MAX_WARC_FILE_SIZE, null);
419: String content = getContent();
420: writeRecord(writer, SOME_URL, "text/html", content.length(),
421: getBaos(content));
422: writer.close();
423: return writer.getFile();
424: }
425:
426: // public void testSpeed() throws IOException {
427: // ARCWriter writer = createArcWithOneRecord("speed", true);
428: // // Add a record with a length that is too long.
429: // String content = getContent();
430: // final int count = 100000;
431: // logger.info("Starting speed write of " + count + " records.");
432: // for (int i = 0; i < count; i++) {
433: // writeRecord(writer, SOME_URL, "text/html", content.length(),
434: // getBaos(content));
435: // }
436: // writer.close();
437: // logger.info("Finished speed write test.");
438: // }
439:
440: public void testArcRecordOffsetReads() throws Exception {
441: // Get an ARC with one record.
442: WriterPoolMember w = createWithOneRecord(
443: "testArcRecordInBufferStream", true);
444: w.close();
445: // Get reader on said ARC.
446: WARCReader r = WARCReaderFactory.get(w.getFile());
447: final Iterator<ArchiveRecord> i = r.iterator();
448: // Skip first ARC meta record.
449: ArchiveRecord ar = i.next();
450: i.hasNext();
451: // Now we're at first and only record in ARC.
452: ar = (WARCRecord) i.next();
453: // Now try getting some random set of bytes out of it
454: // at an odd offset (used to fail because we were
455: // doing bad math to find where in buffer to read).
456: final byte[] buffer = new byte[17];
457: final int maxRead = 4;
458: int totalRead = 0;
459: while (totalRead < maxRead) {
460: totalRead = totalRead
461: + ar.read(buffer, 13 + totalRead, maxRead
462: - totalRead);
463: assertTrue(totalRead > 0);
464: }
465: }
466: }
|