001: /* ARCWriterTest
002: *
003: * $Id: ARCWriterTest.java 5029 2007-03-29 23:53:50Z gojomo $
004: *
005: * Created on Dec 31, 2003.
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.io.arc;
026:
027: import java.io.ByteArrayOutputStream;
028: import java.io.File;
029: import java.io.FileNotFoundException;
030: import java.io.IOException;
031: import java.io.OutputStream;
032: import java.io.PrintStream;
033: import java.util.Arrays;
034: import java.util.Date;
035: import java.util.Iterator;
036: import java.util.List;
037: import java.util.concurrent.atomic.AtomicInteger;
038:
039: import org.archive.io.ArchiveRecord;
040: import org.archive.io.ReplayInputStream;
041: import org.archive.io.WriterPoolMember;
042: import org.archive.util.ArchiveUtils;
043: import org.archive.util.FileUtils;
044: import org.archive.util.TmpDirTestCase;
045:
046: /**
047: * Test ARCWriter class.
048: *
049: * This code exercises ARCWriter AND ARCReader. First it writes ARCs w/
050: * ARCWriter. Then it validates what was written w/ ARCReader.
051: *
052: * @author stack
053: */
054: public class ARCWriterTest extends TmpDirTestCase implements
055: ARCConstants {
056: /**
057: * Prefix to use for ARC files made by JUNIT.
058: */
059: private static final String PREFIX =
060: /* TODO DEFAULT_ARC_FILE_PREFIX*/"IAH";
061:
062: private static final String SOME_URL = "http://www.archive.org/test/";
063:
064: private static final AtomicInteger SERIAL_NO = new AtomicInteger();
065:
066: /*
067: * @see TestCase#setUp()
068: */
069: protected void setUp() throws Exception {
070: super .setUp();
071: }
072:
073: /*
074: * @see TestCase#tearDown()
075: */
076: protected void tearDown() throws Exception {
077: super .tearDown();
078: }
079:
080: protected static String getContent() {
081: return getContent(null);
082: }
083:
084: protected static String getContent(String indexStr) {
085: String page = (indexStr != null) ? "Page #" + indexStr
086: : "Some Page";
087: return "HTTP/1.1 200 OK\r\n"
088: + "Content-Type: text/html\r\n\r\n"
089: + "<html><head><title>" + page + "</title></head>"
090: + "<body>" + page + "</body></html>";
091: }
092:
093: protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
094: throws IOException {
095: String indexStr = Integer.toString(index);
096: ByteArrayOutputStream baos = new ByteArrayOutputStream();
097: // Start the record with an arbitrary 14-digit date per RFC2540
098: String now = ArchiveUtils.get14DigitDate();
099: int recordLength = 0;
100: byte[] record = (getContent(indexStr)).getBytes();
101: recordLength += record.length;
102: baos.write(record);
103: // Add the newline between records back in
104: baos.write("\n".getBytes());
105: recordLength += 1;
106: arcWriter.write("http://www.one.net/id=" + indexStr,
107: "text/html", "0.1.2.3", Long.parseLong(now),
108: recordLength, baos);
109: return recordLength;
110: }
111:
112: private File writeRecords(String baseName, boolean compress,
113: long maxSize, int recordCount) throws IOException {
114: cleanUpOldFiles(baseName);
115: File[] files = { getTmpDir() };
116: ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays
117: .asList(files), baseName + '-' + PREFIX, compress,
118: maxSize);
119: assertNotNull(arcWriter);
120: for (int i = 0; i < recordCount; i++) {
121: writeRandomHTTPRecord(arcWriter, i);
122: }
123: arcWriter.close();
124: assertTrue("Doesn't exist: "
125: + arcWriter.getFile().getAbsolutePath(), arcWriter
126: .getFile().exists());
127: return arcWriter.getFile();
128: }
129:
130: private void validate(File arcFile, int recordCount)
131: throws FileNotFoundException, IOException {
132: ARCReader reader = ARCReaderFactory.get(arcFile);
133: assertNotNull(reader);
134: List metaDatas = null;
135: if (recordCount == -1) {
136: metaDatas = reader.validate();
137: } else {
138: metaDatas = reader.validate(recordCount);
139: }
140: reader.close();
141: // Now, run through each of the records doing absolute get going from
142: // the end to start. Reopen the arc so no context between this test
143: // and the previous.
144: reader = ARCReaderFactory.get(arcFile);
145: for (int i = metaDatas.size() - 1; i >= 0; i--) {
146: ARCRecordMetaData meta = (ARCRecordMetaData) metaDatas
147: .get(i);
148: ArchiveRecord r = reader.get(meta.getOffset());
149: String mimeType = r.getHeader().getMimetype();
150: assertTrue("Record is bogus", mimeType != null
151: && mimeType.length() > 0);
152: }
153: reader.close();
154: assertTrue("Metadatas not equal",
155: metaDatas.size() == recordCount);
156: for (Iterator i = metaDatas.iterator(); i.hasNext();) {
157: ARCRecordMetaData r = (ARCRecordMetaData) i.next();
158: assertTrue("Record is empty", r.getLength() > 0);
159: }
160: }
161:
162: public void testCheckARCFileSize() throws IOException {
163: runCheckARCFileSizeTest("checkARCFileSize", false);
164: }
165:
166: public void testCheckARCFileSizeCompressed() throws IOException {
167: runCheckARCFileSizeTest("checkARCFileSize", true);
168: }
169:
170: public void testWriteRecord() throws IOException {
171: final int recordCount = 2;
172: File arcFile = writeRecords("writeRecord", false,
173: DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
174: validate(arcFile, recordCount + 1); // Header record.
175: }
176:
177: public void testRandomAccess() throws IOException {
178: final int recordCount = 3;
179: File arcFile = writeRecords("writeRecord", true,
180: DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
181: ARCReader reader = ARCReaderFactory.get(arcFile);
182: // Get to second record. Get its offset for later use.
183: boolean readFirst = false;
184: String url = null;
185: long offset = -1;
186: long totalRecords = 0;
187: boolean readSecond = false;
188: for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
189: ARCRecord ar = (ARCRecord) i.next();
190: if (!readFirst) {
191: readFirst = true;
192: continue;
193: }
194: if (!readSecond) {
195: url = ar.getMetaData().getUrl();
196: offset = ar.getMetaData().getOffset();
197: readSecond = true;
198: }
199: }
200:
201: reader = ARCReaderFactory.get(arcFile, offset);
202: ArchiveRecord ar = reader.get();
203: assertEquals(ar.getHeader().getUrl(), url);
204: ar.close();
205:
206: // Get reader again. See how iterator works with offset
207: reader = ARCReaderFactory.get(arcFile, offset);
208: int count = 0;
209: for (final Iterator i = reader.iterator(); i.hasNext(); i
210: .next()) {
211: count++;
212: }
213: reader.close();
214: assertEquals(totalRecords - 1, count);
215: }
216:
217: public void testWriteRecordCompressed() throws IOException {
218: final int recordCount = 2;
219: File arcFile = writeRecords("writeRecordCompressed", true,
220: DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
221: validate(arcFile, recordCount + 1 /*Header record*/);
222: }
223:
224: private void runCheckARCFileSizeTest(String baseName,
225: boolean compress) throws FileNotFoundException, IOException {
226: writeRecords(baseName, compress, 1024, 15);
227: // Now validate all files just created.
228: File[] files = FileUtils
229: .getFilesWithPrefix(getTmpDir(), PREFIX);
230: for (int i = 0; i < files.length; i++) {
231: validate(files[i], -1);
232: }
233: }
234:
235: protected ARCWriter createARCWriter(String NAME, boolean compress) {
236: File[] files = { getTmpDir() };
237: return new ARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
238: compress, DEFAULT_MAX_ARC_FILE_SIZE);
239: }
240:
241: protected static ByteArrayOutputStream getBaos(String str)
242: throws IOException {
243: ByteArrayOutputStream baos = new ByteArrayOutputStream();
244: baos.write(str.getBytes());
245: return baos;
246: }
247:
248: protected static void writeRecord(ARCWriter writer, String url,
249: String type, int len, ByteArrayOutputStream baos)
250: throws IOException {
251: writer.write(url, type, "192.168.1.1", (new Date()).getTime(),
252: len, baos);
253: }
254:
255: protected int iterateRecords(ARCReader r) throws IOException {
256: int count = 0;
257: for (Iterator i = r.iterator(); i.hasNext();) {
258: ARCRecord rec = (ARCRecord) i.next();
259: rec.close();
260: if (count != 0) {
261: assertTrue("Unexpected URL "
262: + rec.getMetaData().getUrl(), rec.getMetaData()
263: .getUrl().equals(SOME_URL));
264: }
265: count++;
266: }
267: return count;
268: }
269:
270: protected ARCWriter createArcWithOneRecord(String name,
271: boolean compressed) throws IOException {
272: ARCWriter writer = createARCWriter(name, compressed);
273: String content = getContent();
274: writeRecord(writer, SOME_URL, "text/html", content.length(),
275: getBaos(content));
276: return writer;
277: }
278:
279: public void testSpaceInURL() {
280: String eMessage = null;
281: try {
282: holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
283: } catch (IOException e) {
284: eMessage = e.getMessage();
285: }
286: assertTrue("Didn't get expected exception: " + eMessage,
287: eMessage.startsWith("Metadata line doesn't match"));
288: }
289:
290: public void testTabInURL() {
291: String eMessage = null;
292: try {
293: holeyUrl("testTabInURL-" + PREFIX, false, "\t");
294: } catch (IOException e) {
295: eMessage = e.getMessage();
296: }
297: assertTrue("Didn't get expected exception: " + eMessage,
298: eMessage.startsWith("Metadata line doesn't match"));
299: }
300:
301: protected void holeyUrl(String name, boolean compress,
302: String urlInsert) throws IOException {
303: ARCWriter writer = createArcWithOneRecord(name, compress);
304: // Add some bytes on the end to mess up the record.
305: String content = getContent();
306: ByteArrayOutputStream baos = getBaos(content);
307: writeRecord(writer, SOME_URL + urlInsert + "/index.html",
308: "text/html", content.length(), baos);
309: writer.close();
310: }
311:
312: // If uncompressed, length has to be right or parse will fail.
313: //
314: // public void testLengthTooShort() throws IOException {
315: // lengthTooShort("testLengthTooShort-" + PREFIX, false);
316: // }
317:
318: public void testLengthTooShortCompressed() throws IOException {
319: lengthTooShort("testLengthTooShortCompressed-" + PREFIX, true,
320: false);
321: }
322:
323: public void testLengthTooShortCompressedStrict() throws IOException {
324: String eMessage = null;
325: try {
326: lengthTooShort("testLengthTooShortCompressedStrict-"
327: + PREFIX, true, true);
328: } catch (RuntimeException e) {
329: eMessage = e.getMessage();
330: }
331: assertTrue(
332: "Didn't get expected exception: " + eMessage,
333: eMessage
334: .startsWith("java.io.IOException: Record ENDING at"));
335: }
336:
337: protected void lengthTooShort(String name, boolean compress,
338: boolean strict) throws IOException {
339: ARCWriter writer = createArcWithOneRecord(name, compress);
340: // Add some bytes on the end to mess up the record.
341: String content = getContent();
342: ByteArrayOutputStream baos = getBaos(content);
343: baos.write("SOME TRAILING BYTES".getBytes());
344: writeRecord(writer, SOME_URL, "text/html", content.length(),
345: baos);
346: writeRecord(writer, SOME_URL, "text/html", content.length(),
347: getBaos(content));
348: writer.close();
349:
350: // Catch System.err into a byte stream.
351: ByteArrayOutputStream os = new ByteArrayOutputStream();
352: System.setErr(new PrintStream(os));
353:
354: ARCReader r = ARCReaderFactory.get(writer.getFile());
355: r.setStrict(strict);
356: int count = iterateRecords(r);
357: assertTrue("Count wrong " + count, count == 4);
358:
359: // Make sure we get the warning string which complains about the
360: // trailing bytes.
361: String err = os.toString();
362: assertTrue("No message " + err, err.startsWith("WARNING")
363: && (err.indexOf("Record ENDING at") > 0));
364: }
365:
366: // If uncompressed, length has to be right or parse will fail.
367: //
368: // public void testLengthTooLong()
369: // throws IOException {
370: // lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
371: // false, false);
372: // }
373:
374: public void testLengthTooLongCompressed() throws IOException {
375: lengthTooLong("testLengthTooLongCompressed-" + PREFIX, true,
376: false);
377: }
378:
379: public void testLengthTooLongCompressedStrict() {
380: String eMessage = null;
381: try {
382: lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
383: true, true);
384: } catch (IOException e) {
385: eMessage = e.getMessage();
386: }
387: assertTrue(
388: "Didn't get expected exception: " + eMessage,
389: eMessage
390: .startsWith("Premature EOF before end-of-record"));
391: }
392:
393: protected void lengthTooLong(String name, boolean compress,
394: boolean strict) throws IOException {
395: ARCWriter writer = createArcWithOneRecord(name, compress);
396: // Add a record with a length that is too long.
397: String content = getContent();
398: writeRecord(writer, SOME_URL, "text/html",
399: content.length() + 10, getBaos(content));
400: writeRecord(writer, SOME_URL, "text/html", content.length(),
401: getBaos(content));
402: writer.close();
403:
404: // Catch System.err.
405: ByteArrayOutputStream os = new ByteArrayOutputStream();
406: System.setErr(new PrintStream(os));
407:
408: ARCReader r = ARCReaderFactory.get(writer.getFile());
409: r.setStrict(strict);
410: int count = iterateRecords(r);
411: assertTrue("Count wrong " + count, count == 4);
412:
413: // Make sure we get the warning string which complains about the
414: // trailing bytes.
415: String err = os.toString();
416: assertTrue(
417: "No message " + err,
418: err
419: .startsWith("WARNING Premature EOF before end-of-record"));
420: }
421:
422: public void testGapError() throws IOException {
423: ARCWriter writer = createArcWithOneRecord("testGapError", true);
424: String content = getContent();
425: // Make a 'weird' RIS that returns bad 'remaining' length
426: // after the call to readFullyTo.
427: ReplayInputStream ris = new ReplayInputStream(content
428: .getBytes(), content.length(), null) {
429: private boolean readFullyToCalled = false;
430:
431: public void readFullyTo(OutputStream os) throws IOException {
432: super .readFullyTo(os);
433: this .readFullyToCalled = true;
434: }
435:
436: public long remaining() {
437: return (this .readFullyToCalled) ? -1 : super
438: .remaining();
439: }
440: };
441: String message = null;
442: try {
443: writer.write(SOME_URL, "text/html", "192.168.1.1",
444: (new Date()).getTime(), content.length(), ris);
445: } catch (IOException e) {
446: message = e.getMessage();
447: }
448: writer.close();
449: assertTrue(
450: "No gap when should be",
451: message != null
452: && message
453: .indexOf("Gap between expected and actual") >= 0);
454: }
455:
456: /**
457: * Write an arc file for other tests to use.
458: * @param arcdir Directory to write to.
459: * @param compress True if file should be compressed.
460: * @return ARC written.
461: * @throws IOException
462: */
463: public static File createARCFile(File arcdir, boolean compress)
464: throws IOException {
465: File[] files = { arcdir };
466: ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays
467: .asList(files), "test", compress,
468: DEFAULT_MAX_ARC_FILE_SIZE);
469: String content = getContent();
470: writeRecord(writer, SOME_URL, "text/html", content.length(),
471: getBaos(content));
472: writer.close();
473: return writer.getFile();
474: }
475:
476: // public void testSpeed() throws IOException {
477: // ARCWriter writer = createArcWithOneRecord("speed", true);
478: // // Add a record with a length that is too long.
479: // String content = getContent();
480: // final int count = 100000;
481: // logger.info("Starting speed write of " + count + " records.");
482: // for (int i = 0; i < count; i++) {
483: // writeRecord(writer, SOME_URL, "text/html", content.length(),
484: // getBaos(content));
485: // }
486: // writer.close();
487: // logger.info("Finished speed write test.");
488: // }
489:
490: public void testValidateMetaLine() throws Exception {
491: final String line = "http://www.aandw.net/images/walden2.png "
492: + "128.197.34.86 20060111174224 image/png 2160";
493: ARCWriter w = createARCWriter("testValidateMetaLine", true);
494: try {
495: w.validateMetaLine(line);
496: w.validateMetaLine(line + LINE_SEPARATOR);
497: w.validateMetaLine(line + "\\r\\n");
498: } finally {
499: w.close();
500: }
501: }
502:
503: public void testArcRecordOffsetReads() throws Exception {
504: // Get an ARC with one record.
505: WriterPoolMember w = createArcWithOneRecord(
506: "testArcRecordInBufferStream", true);
507: w.close();
508: // Get reader on said ARC.
509: ARCReader r = ARCReaderFactory.get(w.getFile());
510: final Iterator i = r.iterator();
511: // Skip first ARC meta record.
512: ARCRecord ar = (ARCRecord) i.next();
513: i.hasNext();
514: // Now we're at first and only record in ARC.
515: ar = (ARCRecord) i.next();
516: // Now try getting some random set of bytes out of it
517: // at an odd offset (used to fail because we were
518: // doing bad math to find where in buffer to read).
519: final byte[] buffer = new byte[17];
520: final int maxRead = 4;
521: int totalRead = 0;
522: while (totalRead < maxRead) {
523: totalRead = totalRead
524: + ar.read(buffer, 13 + totalRead, maxRead
525: - totalRead);
526: assertTrue(totalRead > 0);
527: }
528: }
529: }
|