Source Code Cross Referenced for ARCWriterTest.java in  » Web-Crawler » heritrix » org » archive » io » arc » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.arc 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* ARCWriterTest
002:         *
003:         * $Id: ARCWriterTest.java 5029 2007-03-29 23:53:50Z gojomo $
004:         *
005:         * Created on Dec 31, 2003.
006:         *
007:         * Copyright (C) 2003 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.io.arc;
026:
027:        import java.io.ByteArrayOutputStream;
028:        import java.io.File;
029:        import java.io.FileNotFoundException;
030:        import java.io.IOException;
031:        import java.io.OutputStream;
032:        import java.io.PrintStream;
033:        import java.util.Arrays;
034:        import java.util.Date;
035:        import java.util.Iterator;
036:        import java.util.List;
037:        import java.util.concurrent.atomic.AtomicInteger;
038:
039:        import org.archive.io.ArchiveRecord;
040:        import org.archive.io.ReplayInputStream;
041:        import org.archive.io.WriterPoolMember;
042:        import org.archive.util.ArchiveUtils;
043:        import org.archive.util.FileUtils;
044:        import org.archive.util.TmpDirTestCase;
045:
046:        /**
047:         * Test ARCWriter class.
048:         *
049:         * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
050:         * ARCWriter.  Then it validates what was written w/ ARCReader.
051:         *
052:         * @author stack
053:         */
054:        public class ARCWriterTest extends TmpDirTestCase implements 
055:                ARCConstants {
056:            /**
057:             * Prefix to use for ARC files made by JUNIT.
058:             */
059:            private static final String PREFIX =
060:            /* TODO DEFAULT_ARC_FILE_PREFIX*/"IAH";
061:
062:            private static final String SOME_URL = "http://www.archive.org/test/";
063:
064:            private static final AtomicInteger SERIAL_NO = new AtomicInteger();
065:
066:            /*
067:             * @see TestCase#setUp()
068:             */
069:            protected void setUp() throws Exception {
070:                super .setUp();
071:            }
072:
073:            /*
074:             * @see TestCase#tearDown()
075:             */
076:            protected void tearDown() throws Exception {
077:                super .tearDown();
078:            }
079:
080:            protected static String getContent() {
081:                return getContent(null);
082:            }
083:
084:            protected static String getContent(String indexStr) {
085:                String page = (indexStr != null) ? "Page #" + indexStr
086:                        : "Some Page";
087:                return "HTTP/1.1 200 OK\r\n"
088:                        + "Content-Type: text/html\r\n\r\n"
089:                        + "<html><head><title>" + page + "</title></head>"
090:                        + "<body>" + page + "</body></html>";
091:            }
092:
093:            protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
094:                    throws IOException {
095:                String indexStr = Integer.toString(index);
096:                ByteArrayOutputStream baos = new ByteArrayOutputStream();
097:                // Start the record with an arbitrary 14-digit date per RFC2540
098:                String now = ArchiveUtils.get14DigitDate();
099:                int recordLength = 0;
100:                byte[] record = (getContent(indexStr)).getBytes();
101:                recordLength += record.length;
102:                baos.write(record);
103:                // Add the newline between records back in
104:                baos.write("\n".getBytes());
105:                recordLength += 1;
106:                arcWriter.write("http://www.one.net/id=" + indexStr,
107:                        "text/html", "0.1.2.3", Long.parseLong(now),
108:                        recordLength, baos);
109:                return recordLength;
110:            }
111:
112:            private File writeRecords(String baseName, boolean compress,
113:                    long maxSize, int recordCount) throws IOException {
114:                cleanUpOldFiles(baseName);
115:                File[] files = { getTmpDir() };
116:                ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays
117:                        .asList(files), baseName + '-' + PREFIX, compress,
118:                        maxSize);
119:                assertNotNull(arcWriter);
120:                for (int i = 0; i < recordCount; i++) {
121:                    writeRandomHTTPRecord(arcWriter, i);
122:                }
123:                arcWriter.close();
124:                assertTrue("Doesn't exist: "
125:                        + arcWriter.getFile().getAbsolutePath(), arcWriter
126:                        .getFile().exists());
127:                return arcWriter.getFile();
128:            }
129:
130:            private void validate(File arcFile, int recordCount)
131:                    throws FileNotFoundException, IOException {
132:                ARCReader reader = ARCReaderFactory.get(arcFile);
133:                assertNotNull(reader);
134:                List metaDatas = null;
135:                if (recordCount == -1) {
136:                    metaDatas = reader.validate();
137:                } else {
138:                    metaDatas = reader.validate(recordCount);
139:                }
140:                reader.close();
141:                // Now, run through each of the records doing absolute get going from
142:                // the end to start.  Reopen the arc so no context between this test
143:                // and the previous.
144:                reader = ARCReaderFactory.get(arcFile);
145:                for (int i = metaDatas.size() - 1; i >= 0; i--) {
146:                    ARCRecordMetaData meta = (ARCRecordMetaData) metaDatas
147:                            .get(i);
148:                    ArchiveRecord r = reader.get(meta.getOffset());
149:                    String mimeType = r.getHeader().getMimetype();
150:                    assertTrue("Record is bogus", mimeType != null
151:                            && mimeType.length() > 0);
152:                }
153:                reader.close();
154:                assertTrue("Metadatas not equal",
155:                        metaDatas.size() == recordCount);
156:                for (Iterator i = metaDatas.iterator(); i.hasNext();) {
157:                    ARCRecordMetaData r = (ARCRecordMetaData) i.next();
158:                    assertTrue("Record is empty", r.getLength() > 0);
159:                }
160:            }
161:
162:            public void testCheckARCFileSize() throws IOException {
163:                runCheckARCFileSizeTest("checkARCFileSize", false);
164:            }
165:
166:            public void testCheckARCFileSizeCompressed() throws IOException {
167:                runCheckARCFileSizeTest("checkARCFileSize", true);
168:            }
169:
170:            public void testWriteRecord() throws IOException {
171:                final int recordCount = 2;
172:                File arcFile = writeRecords("writeRecord", false,
173:                        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
174:                validate(arcFile, recordCount + 1); // Header record.
175:            }
176:
177:            public void testRandomAccess() throws IOException {
178:                final int recordCount = 3;
179:                File arcFile = writeRecords("writeRecord", true,
180:                        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
181:                ARCReader reader = ARCReaderFactory.get(arcFile);
182:                // Get to second record.  Get its offset for later use.
183:                boolean readFirst = false;
184:                String url = null;
185:                long offset = -1;
186:                long totalRecords = 0;
187:                boolean readSecond = false;
188:                for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
189:                    ARCRecord ar = (ARCRecord) i.next();
190:                    if (!readFirst) {
191:                        readFirst = true;
192:                        continue;
193:                    }
194:                    if (!readSecond) {
195:                        url = ar.getMetaData().getUrl();
196:                        offset = ar.getMetaData().getOffset();
197:                        readSecond = true;
198:                    }
199:                }
200:
201:                reader = ARCReaderFactory.get(arcFile, offset);
202:                ArchiveRecord ar = reader.get();
203:                assertEquals(ar.getHeader().getUrl(), url);
204:                ar.close();
205:
206:                // Get reader again.  See how iterator works with offset
207:                reader = ARCReaderFactory.get(arcFile, offset);
208:                int count = 0;
209:                for (final Iterator i = reader.iterator(); i.hasNext(); i
210:                        .next()) {
211:                    count++;
212:                }
213:                reader.close();
214:                assertEquals(totalRecords - 1, count);
215:            }
216:
217:            public void testWriteRecordCompressed() throws IOException {
218:                final int recordCount = 2;
219:                File arcFile = writeRecords("writeRecordCompressed", true,
220:                        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
221:                validate(arcFile, recordCount + 1 /*Header record*/);
222:            }
223:
224:            private void runCheckARCFileSizeTest(String baseName,
225:                    boolean compress) throws FileNotFoundException, IOException {
226:                writeRecords(baseName, compress, 1024, 15);
227:                // Now validate all files just created.
228:                File[] files = FileUtils
229:                        .getFilesWithPrefix(getTmpDir(), PREFIX);
230:                for (int i = 0; i < files.length; i++) {
231:                    validate(files[i], -1);
232:                }
233:            }
234:
235:            protected ARCWriter createARCWriter(String NAME, boolean compress) {
236:                File[] files = { getTmpDir() };
237:                return new ARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
238:                        compress, DEFAULT_MAX_ARC_FILE_SIZE);
239:            }
240:
241:            protected static ByteArrayOutputStream getBaos(String str)
242:                    throws IOException {
243:                ByteArrayOutputStream baos = new ByteArrayOutputStream();
244:                baos.write(str.getBytes());
245:                return baos;
246:            }
247:
248:            protected static void writeRecord(ARCWriter writer, String url,
249:                    String type, int len, ByteArrayOutputStream baos)
250:                    throws IOException {
251:                writer.write(url, type, "192.168.1.1", (new Date()).getTime(),
252:                        len, baos);
253:            }
254:
255:            protected int iterateRecords(ARCReader r) throws IOException {
256:                int count = 0;
257:                for (Iterator i = r.iterator(); i.hasNext();) {
258:                    ARCRecord rec = (ARCRecord) i.next();
259:                    rec.close();
260:                    if (count != 0) {
261:                        assertTrue("Unexpected URL "
262:                                + rec.getMetaData().getUrl(), rec.getMetaData()
263:                                .getUrl().equals(SOME_URL));
264:                    }
265:                    count++;
266:                }
267:                return count;
268:            }
269:
270:            protected ARCWriter createArcWithOneRecord(String name,
271:                    boolean compressed) throws IOException {
272:                ARCWriter writer = createARCWriter(name, compressed);
273:                String content = getContent();
274:                writeRecord(writer, SOME_URL, "text/html", content.length(),
275:                        getBaos(content));
276:                return writer;
277:            }
278:
279:            public void testSpaceInURL() {
280:                String eMessage = null;
281:                try {
282:                    holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
283:                } catch (IOException e) {
284:                    eMessage = e.getMessage();
285:                }
286:                assertTrue("Didn't get expected exception: " + eMessage,
287:                        eMessage.startsWith("Metadata line doesn't match"));
288:            }
289:
290:            public void testTabInURL() {
291:                String eMessage = null;
292:                try {
293:                    holeyUrl("testTabInURL-" + PREFIX, false, "\t");
294:                } catch (IOException e) {
295:                    eMessage = e.getMessage();
296:                }
297:                assertTrue("Didn't get expected exception: " + eMessage,
298:                        eMessage.startsWith("Metadata line doesn't match"));
299:            }
300:
301:            protected void holeyUrl(String name, boolean compress,
302:                    String urlInsert) throws IOException {
303:                ARCWriter writer = createArcWithOneRecord(name, compress);
304:                // Add some bytes on the end to mess up the record.
305:                String content = getContent();
306:                ByteArrayOutputStream baos = getBaos(content);
307:                writeRecord(writer, SOME_URL + urlInsert + "/index.html",
308:                        "text/html", content.length(), baos);
309:                writer.close();
310:            }
311:
312:            // If uncompressed, length has to be right or parse will fail.
313:            //
314:            //    public void testLengthTooShort() throws IOException {
315:            //        lengthTooShort("testLengthTooShort-" + PREFIX, false);
316:            //    }
317:
318:            public void testLengthTooShortCompressed() throws IOException {
319:                lengthTooShort("testLengthTooShortCompressed-" + PREFIX, true,
320:                        false);
321:            }
322:
323:            public void testLengthTooShortCompressedStrict() throws IOException {
324:                String eMessage = null;
325:                try {
326:                    lengthTooShort("testLengthTooShortCompressedStrict-"
327:                            + PREFIX, true, true);
328:                } catch (RuntimeException e) {
329:                    eMessage = e.getMessage();
330:                }
331:                assertTrue(
332:                        "Didn't get expected exception: " + eMessage,
333:                        eMessage
334:                                .startsWith("java.io.IOException: Record ENDING at"));
335:            }
336:
337:            protected void lengthTooShort(String name, boolean compress,
338:                    boolean strict) throws IOException {
339:                ARCWriter writer = createArcWithOneRecord(name, compress);
340:                // Add some bytes on the end to mess up the record.
341:                String content = getContent();
342:                ByteArrayOutputStream baos = getBaos(content);
343:                baos.write("SOME TRAILING BYTES".getBytes());
344:                writeRecord(writer, SOME_URL, "text/html", content.length(),
345:                        baos);
346:                writeRecord(writer, SOME_URL, "text/html", content.length(),
347:                        getBaos(content));
348:                writer.close();
349:
350:                // Catch System.err into a byte stream.
351:                ByteArrayOutputStream os = new ByteArrayOutputStream();
352:                System.setErr(new PrintStream(os));
353:
354:                ARCReader r = ARCReaderFactory.get(writer.getFile());
355:                r.setStrict(strict);
356:                int count = iterateRecords(r);
357:                assertTrue("Count wrong " + count, count == 4);
358:
359:                // Make sure we get the warning string which complains about the
360:                // trailing bytes.
361:                String err = os.toString();
362:                assertTrue("No message " + err, err.startsWith("WARNING")
363:                        && (err.indexOf("Record ENDING at") > 0));
364:            }
365:
366:            //  If uncompressed, length has to be right or parse will fail.
367:            //
368:            //    public void testLengthTooLong()
369:            //    throws IOException {
370:            //        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
371:            //            false, false);
372:            //    }
373:
374:            public void testLengthTooLongCompressed() throws IOException {
375:                lengthTooLong("testLengthTooLongCompressed-" + PREFIX, true,
376:                        false);
377:            }
378:
379:            public void testLengthTooLongCompressedStrict() {
380:                String eMessage = null;
381:                try {
382:                    lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
383:                            true, true);
384:                } catch (IOException e) {
385:                    eMessage = e.getMessage();
386:                }
387:                assertTrue(
388:                        "Didn't get expected exception: " + eMessage,
389:                        eMessage
390:                                .startsWith("Premature EOF before end-of-record"));
391:            }
392:
393:            protected void lengthTooLong(String name, boolean compress,
394:                    boolean strict) throws IOException {
395:                ARCWriter writer = createArcWithOneRecord(name, compress);
396:                // Add a record with a length that is too long.
397:                String content = getContent();
398:                writeRecord(writer, SOME_URL, "text/html",
399:                        content.length() + 10, getBaos(content));
400:                writeRecord(writer, SOME_URL, "text/html", content.length(),
401:                        getBaos(content));
402:                writer.close();
403:
404:                // Catch System.err.
405:                ByteArrayOutputStream os = new ByteArrayOutputStream();
406:                System.setErr(new PrintStream(os));
407:
408:                ARCReader r = ARCReaderFactory.get(writer.getFile());
409:                r.setStrict(strict);
410:                int count = iterateRecords(r);
411:                assertTrue("Count wrong " + count, count == 4);
412:
413:                // Make sure we get the warning string which complains about the
414:                // trailing bytes.
415:                String err = os.toString();
416:                assertTrue(
417:                        "No message " + err,
418:                        err
419:                                .startsWith("WARNING Premature EOF before end-of-record"));
420:            }
421:
422:            public void testGapError() throws IOException {
423:                ARCWriter writer = createArcWithOneRecord("testGapError", true);
424:                String content = getContent();
425:                // Make a 'weird' RIS that returns bad 'remaining' length
426:                // after the call to readFullyTo.
427:                ReplayInputStream ris = new ReplayInputStream(content
428:                        .getBytes(), content.length(), null) {
429:                    private boolean readFullyToCalled = false;
430:
431:                    public void readFullyTo(OutputStream os) throws IOException {
432:                        super .readFullyTo(os);
433:                        this .readFullyToCalled = true;
434:                    }
435:
436:                    public long remaining() {
437:                        return (this .readFullyToCalled) ? -1 : super 
438:                                .remaining();
439:                    }
440:                };
441:                String message = null;
442:                try {
443:                    writer.write(SOME_URL, "text/html", "192.168.1.1",
444:                            (new Date()).getTime(), content.length(), ris);
445:                } catch (IOException e) {
446:                    message = e.getMessage();
447:                }
448:                writer.close();
449:                assertTrue(
450:                        "No gap when should be",
451:                        message != null
452:                                && message
453:                                        .indexOf("Gap between expected and actual") >= 0);
454:            }
455:
456:            /**
457:             * Write an arc file for other tests to use.
458:             * @param arcdir Directory to write to.
459:             * @param compress True if file should be compressed.
460:             * @return ARC written.
461:             * @throws IOException 
462:             */
463:            public static File createARCFile(File arcdir, boolean compress)
464:                    throws IOException {
465:                File[] files = { arcdir };
466:                ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays
467:                        .asList(files), "test", compress,
468:                        DEFAULT_MAX_ARC_FILE_SIZE);
469:                String content = getContent();
470:                writeRecord(writer, SOME_URL, "text/html", content.length(),
471:                        getBaos(content));
472:                writer.close();
473:                return writer.getFile();
474:            }
475:
476:            //    public void testSpeed() throws IOException {
477:            //        ARCWriter writer = createArcWithOneRecord("speed", true);
478:            //        // Add a record with a length that is too long.
479:            //        String content = getContent();
480:            //        final int count = 100000;
481:            //        logger.info("Starting speed write of " + count + " records.");
482:            //        for (int i = 0; i < count; i++) {
483:            //            writeRecord(writer, SOME_URL, "text/html", content.length(),
484:            //                    getBaos(content));
485:            //        }
486:            //        writer.close();
487:            //        logger.info("Finished speed write test.");
488:            //    }
489:
490:            public void testValidateMetaLine() throws Exception {
491:                final String line = "http://www.aandw.net/images/walden2.png "
492:                        + "128.197.34.86 20060111174224 image/png 2160";
493:                ARCWriter w = createARCWriter("testValidateMetaLine", true);
494:                try {
495:                    w.validateMetaLine(line);
496:                    w.validateMetaLine(line + LINE_SEPARATOR);
497:                    w.validateMetaLine(line + "\\r\\n");
498:                } finally {
499:                    w.close();
500:                }
501:            }
502:
503:            public void testArcRecordOffsetReads() throws Exception {
504:                // Get an ARC with one record.
505:                WriterPoolMember w = createArcWithOneRecord(
506:                        "testArcRecordInBufferStream", true);
507:                w.close();
508:                // Get reader on said ARC.
509:                ARCReader r = ARCReaderFactory.get(w.getFile());
510:                final Iterator i = r.iterator();
511:                // Skip first ARC meta record.
512:                ARCRecord ar = (ARCRecord) i.next();
513:                i.hasNext();
514:                // Now we're at first and only record in ARC.
515:                ar = (ARCRecord) i.next();
516:                // Now try getting some random set of bytes out of it 
517:                // at an odd offset (used to fail because we were
518:                // doing bad math to find where in buffer to read).
519:                final byte[] buffer = new byte[17];
520:                final int maxRead = 4;
521:                int totalRead = 0;
522:                while (totalRead < maxRead) {
523:                    totalRead = totalRead
524:                            + ar.read(buffer, 13 + totalRead, maxRead
525:                                    - totalRead);
526:                    assertTrue(totalRead > 0);
527:                }
528:            }
529:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.