Source Code Cross Referenced for ARCReaderFactory.java in  » Web-Crawler » heritrix » org » archive » io » arc » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.arc 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* ARCReaderFactory
002:         *
003:         * $Id: ARCReaderFactory.java 4950 2007-03-01 20:31:19Z stack-sf $
004:         *
005:         * Created on May 1, 2004
006:         *
007:         * Copyright (C) 2004 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.io.arc;
026:
027:        import java.io.File;
028:        import java.io.FileInputStream;
029:        import java.io.IOException;
030:        import java.io.InputStream;
031:        import java.net.MalformedURLException;
032:        import java.net.URL;
033:        import java.util.Iterator;
034:        import java.util.logging.Level;
035:
036:        import org.archive.io.ArchiveReader;
037:        import org.archive.io.ArchiveReaderFactory;
038:        import org.archive.io.ArchiveRecord;
039:        import org.archive.io.ArchiveRecordHeader;
040:        import org.archive.io.GzipHeader;
041:        import org.archive.io.GzippedInputStream;
042:        import org.archive.io.NoGzipMagicException;
043:        import org.archive.util.FileUtils;
044:
045:        /**
046:         * Factory that returns an ARCReader.
047:         * 
048:         * Can handle compressed and uncompressed ARCs.
049:         *
050:         * @author stack
051:         */
052:        public class ARCReaderFactory extends ArchiveReaderFactory implements 
053:                ARCConstants {
054:            /**
055:             * This factory instance.
056:             */
057:            private static final ARCReaderFactory factory = new ARCReaderFactory();
058:
059:            /**
060:             * Shutdown any access to default constructor.
061:             */
062:            protected ARCReaderFactory() {
063:                super ();
064:            }
065:
066:            public static ARCReader get(String arcFileOrUrl)
067:                    throws MalformedURLException, IOException {
068:                return (ARCReader) ARCReaderFactory.factory
069:                        .getArchiveReader(arcFileOrUrl);
070:            }
071:
072:            public static ARCReader get(String arcFileOrUrl, final long offset)
073:                    throws MalformedURLException, IOException {
074:                return (ARCReader) ARCReaderFactory.factory.getArchiveReader(
075:                        arcFileOrUrl, offset);
076:            }
077:
078:            public static ARCReader get(final File f) throws IOException {
079:                return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f);
080:            }
081:
082:            public static ARCReader get(final File f, final long offset)
083:                    throws IOException {
084:                return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f,
085:                        offset);
086:            }
087:
088:            protected ArchiveReader getArchiveReader(final File f,
089:                    final long offset) throws IOException {
090:                return getArchiveReader(f, true, offset);
091:            }
092:
093:            /**
094:             * @param f An arcfile to read.
095:             * @param skipSuffixTest Set to true if want to test that ARC has proper
096:             * suffix. Use this method and pass <code>false</code> to open ARCs
097:             * with the <code>.open</code> or otherwise suffix.
098:             * @param offset Have returned ARCReader set to start reading at passed
099:             * offset.
100:             * @return An ARCReader.
101:             * @throws IOException 
102:             */
103:            public static ARCReader get(final File f,
104:                    final boolean skipSuffixTest, final long offset)
105:                    throws IOException {
106:                return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f,
107:                        skipSuffixTest, offset);
108:            }
109:
110:            protected ArchiveReader getArchiveReader(final File arcFile,
111:                    final boolean skipSuffixTest, final long offset)
112:                    throws IOException {
113:                boolean compressed = testCompressedARCFile(arcFile,
114:                        skipSuffixTest);
115:                if (!compressed) {
116:                    if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
117:                            ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
118:                        throw new IOException(arcFile.getAbsolutePath()
119:                                + " is not an Internet Archive ARC file.");
120:                    }
121:                }
122:                return compressed ? (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(
123:                        arcFile, offset)
124:                        : (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(
125:                                arcFile, offset);
126:            }
127:
128:            public static ArchiveReader get(final String s,
129:                    final InputStream is, final boolean atFirstRecord)
130:                    throws IOException {
131:                return ARCReaderFactory.factory.getArchiveReader(s, is,
132:                        atFirstRecord);
133:            }
134:
135:            protected ArchiveReader getArchiveReader(final String arc,
136:                    final InputStream is, final boolean atFirstRecord)
137:                    throws IOException {
138:                // For now, assume stream is compressed. Later add test of input
139:                // stream or handle exception thrown when figure not compressed stream.
140:                return new CompressedARCReader(arc, asRepositionable(is),
141:                        atFirstRecord);
142:            }
143:
144:            /**
145:             * Get an ARCReader aligned at <code>offset</code>. This version of get
146:             * will not bring the ARC local but will try to stream across the net making
147:             * an HTTP 1.1 Range request on remote http server (RFC1435 Section 14.35).
148:             * 
149:             * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
150:             * @param offset Offset into ARC at which to start fetching.
151:             * @return An ARCReader aligned at offset.
152:             * @throws IOException
153:             */
154:            public static ARCReader get(final URL arcUrl, final long offset)
155:                    throws IOException {
156:                return (ARCReader) ARCReaderFactory.factory.getArchiveReader(
157:                        arcUrl, offset);
158:            }
159:
160:            /**
161:             * Get an ARCReader.
162:             * Pulls the ARC local into whereever the System Property
163:             * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
164:             * points at this local copy.  A close on this ARCReader instance will
165:             * remove the local copy.
166:             * @param arcUrl An URL that points at an ARC.
167:             * @return An ARCReader.
168:             * @throws IOException 
169:             */
170:            public static ARCReader get(final URL arcUrl) throws IOException {
171:                return (ARCReader) ARCReaderFactory.factory
172:                        .getArchiveReader(arcUrl);
173:            }
174:
175:            /**
176:             * @param arcFile File to test.
177:             * @return True if <code>arcFile</code> is compressed ARC.
178:             * @throws IOException
179:             */
180:            public boolean isCompressed(File arcFile) throws IOException {
181:                return testCompressedARCFile(arcFile);
182:            }
183:
184:            /**
185:             * Check file is compressed and in ARC GZIP format.
186:             *
187:             * @param arcFile File to test if its Internet Archive ARC file
188:             * GZIP compressed.
189:             *
190:             * @return True if this is an Internet Archive GZIP'd ARC file (It begins
191:             * w/ the Internet Archive GZIP header and has the
192:             * COMPRESSED_ARC_FILE_EXTENSION suffix).
193:             *
194:             * @exception IOException If file does not exist or is not unreadable.
195:             */
196:            public static boolean testCompressedARCFile(File arcFile)
197:                    throws IOException {
198:                return testCompressedARCFile(arcFile, false);
199:            }
200:
201:            /**
202:             * Check file is compressed and in ARC GZIP format.
203:             *
204:             * @param arcFile File to test if its Internet Archive ARC file
205:             * GZIP compressed.
206:             * @param skipSuffixCheck Set to true if we're not to test on the
207:             * '.arc.gz' suffix.
208:             *
209:             * @return True if this is an Internet Archive GZIP'd ARC file (It begins
210:             * w/ the Internet Archive GZIP header).
211:             *
212:             * @exception IOException If file does not exist or is not unreadable.
213:             */
214:            public static boolean testCompressedARCFile(File arcFile,
215:                    boolean skipSuffixCheck) throws IOException {
216:                boolean compressedARCFile = false;
217:                FileUtils.isReadable(arcFile);
218:                if (!skipSuffixCheck
219:                        && !arcFile.getName().toLowerCase().endsWith(
220:                                COMPRESSED_ARC_FILE_EXTENSION)) {
221:                    return compressedARCFile;
222:                }
223:
224:                final InputStream is = new FileInputStream(arcFile);
225:                try {
226:                    compressedARCFile = testCompressedARCStream(is);
227:                } finally {
228:                    is.close();
229:                }
230:                return compressedARCFile;
231:            }
232:
233:            public static boolean isARCSuffix(final String arcName) {
234:                return (arcName == null) ? false : (arcName.toLowerCase()
235:                        .endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION)) ? true
236:                        : (arcName.toLowerCase()
237:                                .endsWith(DOT_ARC_FILE_EXTENSION)) ? true
238:                                : false;
239:            }
240:
241:            /**
242:             * Tests passed stream is gzip stream by reading in the HEAD.
243:             * Does not reposition the stream.  That is left up to the caller.
244:             * @param is An InputStream.
245:             * @return True if compressed stream.
246:             * @throws IOException
247:             */
248:            public static boolean testCompressedARCStream(final InputStream is)
249:                    throws IOException {
250:                boolean compressedARCFile = false;
251:                GzipHeader gh = null;
252:                try {
253:                    gh = new GzipHeader(is);
254:                } catch (NoGzipMagicException e) {
255:                    return compressedARCFile;
256:                }
257:
258:                byte[] fextra = gh.getFextra();
259:                // Now make sure following bytes are IA GZIP comment.
260:                // First check length. ARC_GZIP_EXTRA_FIELD includes length
261:                // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
262:                // at +2.
263:                if (fextra != null
264:                        && ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
265:                    compressedARCFile = true;
266:                    for (int i = 0; i < fextra.length; i++) {
267:                        if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
268:                            compressedARCFile = false;
269:                            break;
270:                        }
271:                    }
272:                }
273:                return compressedARCFile;
274:            }
275:
276:            /**
277:             * Uncompressed arc file reader.
278:             * @author stack
279:             */
280:            private class UncompressedARCReader extends ARCReader {
281:                /**
282:                 * Constructor.
283:                 * @param f Uncompressed arcfile to read.
284:                 * @throws IOException
285:                 */
286:                public UncompressedARCReader(final File f) throws IOException {
287:                    this (f, 0);
288:                }
289:
290:                /**
291:                 * Constructor.
292:                 * 
293:                 * @param f Uncompressed arcfile to read.
294:                 * @param offset Offset at which to position ARCReader.
295:                 * @throws IOException
296:                 */
297:                public UncompressedARCReader(final File f, final long offset)
298:                        throws IOException {
299:                    // Arc file has been tested for existence by time it has come
300:                    // to here.
301:                    setIn(getInputStream(f, offset));
302:                    initialize(f.getAbsolutePath());
303:                }
304:
305:                /**
306:                 * Constructor.
307:                 * 
308:                 * @param f Uncompressed arc to read.
309:                 * @param is InputStream.
310:                 */
311:                public UncompressedARCReader(final String f,
312:                        final InputStream is) {
313:                    // Arc file has been tested for existence by time it has come
314:                    // to here.
315:                    setIn(is);
316:                    initialize(f);
317:                }
318:            }
319:
320:            /**
321:             * Compressed arc file reader.
322:             * 
323:             * @author stack
324:             */
325:            private class CompressedARCReader extends ARCReader {
326:
327:                /**
328:                 * Constructor.
329:                 * 
330:                 * @param f
331:                 *            Compressed arcfile to read.
332:                 * @throws IOException
333:                 */
334:                public CompressedARCReader(final File f) throws IOException {
335:                    this (f, 0);
336:                }
337:
338:                /**
339:                 * Constructor.
340:                 * 
341:                 * @param f Compressed arcfile to read.
342:                 * @param offset Position at where to start reading file.
343:                 * @throws IOException
344:                 */
345:                public CompressedARCReader(final File f, final long offset)
346:                        throws IOException {
347:                    // Arc file has been tested for existence by time it has come
348:                    // to here.
349:                    setIn(new GzippedInputStream(getInputStream(f, offset)));
350:                    setCompressed((offset == 0));
351:                    initialize(f.getAbsolutePath());
352:                }
353:
354:                /**
355:                 * Constructor.
356:                 * 
357:                 * @param f Compressed arcfile.
358:                 * @param is InputStream to use.
359:                 * @throws IOException
360:                 */
361:                public CompressedARCReader(final String f,
362:                        final InputStream is, final boolean atFirstRecord)
363:                        throws IOException {
364:                    // Arc file has been tested for existence by time it has come
365:                    // to here.
366:                    setIn(new GzippedInputStream(is));
367:                    setCompressed(true);
368:                    setAlignedOnFirstRecord(atFirstRecord);
369:                    initialize(f);
370:                }
371:
372:                /**
373:                 * Get record at passed <code>offset</code>.
374:                 * 
375:                 * @param offset
376:                 *            Byte index into arcfile at which a record starts.
377:                 * @return An ARCRecord reference.
378:                 * @throws IOException
379:                 */
380:                public ARCRecord get(long offset) throws IOException {
381:                    cleanupCurrentRecord();
382:                    ((GzippedInputStream) getIn()).gzipMemberSeek(offset);
383:                    return createArchiveRecord(getIn(), offset);
384:                }
385:
386:                public Iterator<ArchiveRecord> iterator() {
387:                    /**
388:                     * Override ARCRecordIterator so can base returned iterator on
389:                     * GzippedInputStream iterator.
390:                     */
391:                    return new ArchiveRecordIterator() {
392:                        private GzippedInputStream gis = (GzippedInputStream) getInputStream();
393:
394:                        private Iterator gzipIterator = this .gis.iterator();
395:
396:                        protected boolean innerHasNext() {
397:                            return this .gzipIterator.hasNext();
398:                        }
399:
400:                        protected ArchiveRecord innerNext() throws IOException {
401:                            // Get the position before gzipIterator.next moves
402:                            // it on past the gzip header.
403:                            long p = this .gis.position();
404:                            InputStream is = (InputStream) this .gzipIterator
405:                                    .next();
406:                            return createArchiveRecord(is, p);
407:                        }
408:                    };
409:                }
410:
411:                protected void gotoEOR(ArchiveRecord rec) throws IOException {
412:                    long skipped = ((GzippedInputStream) getIn())
413:                            .gotoEOR(LINE_SEPARATOR);
414:                    if (skipped <= 0) {
415:                        return;
416:                    }
417:                    // Report on system error the number of unexpected characters
418:                    // at the end of this record.
419:                    ArchiveRecordHeader meta = (getCurrentRecord() != null) ? rec
420:                            .getHeader()
421:                            : null;
422:                    String message = "Record ENDING at "
423:                            + ((GzippedInputStream) getIn()).position()
424:                            + " has " + skipped + " trailing byte(s): "
425:                            + ((meta != null) ? meta.toString() : "");
426:                    if (isStrict()) {
427:                        throw new IOException(message);
428:                    }
429:                    logStdErr(Level.WARNING, message);
430:                }
431:            }
432:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.