001: /* ARCReaderFactory
002: *
003: * $Id: ARCReaderFactory.java 4950 2007-03-01 20:31:19Z stack-sf $
004: *
005: * Created on May 1, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.io.arc;
026:
027: import java.io.File;
028: import java.io.FileInputStream;
029: import java.io.IOException;
030: import java.io.InputStream;
031: import java.net.MalformedURLException;
032: import java.net.URL;
033: import java.util.Iterator;
034: import java.util.logging.Level;
035:
036: import org.archive.io.ArchiveReader;
037: import org.archive.io.ArchiveReaderFactory;
038: import org.archive.io.ArchiveRecord;
039: import org.archive.io.ArchiveRecordHeader;
040: import org.archive.io.GzipHeader;
041: import org.archive.io.GzippedInputStream;
042: import org.archive.io.NoGzipMagicException;
043: import org.archive.util.FileUtils;
044:
045: /**
046: * Factory that returns an ARCReader.
047: *
048: * Can handle compressed and uncompressed ARCs.
049: *
050: * @author stack
051: */
052: public class ARCReaderFactory extends ArchiveReaderFactory implements
053: ARCConstants {
/**
 * The single shared factory instance that the static <code>get</code>
 * helpers of this class delegate to.
 */
private static final ARCReaderFactory factory = new ARCReaderFactory();

/**
 * Non-public constructor: clients are meant to use the static
 * <code>get</code> methods rather than build a factory themselves.
 * Note that it is <code>protected</code>, not private, so subclasses and
 * classes in this package can still call it.
 */
protected ARCReaderFactory() {
    super();
}
065:
066: public static ARCReader get(String arcFileOrUrl)
067: throws MalformedURLException, IOException {
068: return (ARCReader) ARCReaderFactory.factory
069: .getArchiveReader(arcFileOrUrl);
070: }
071:
072: public static ARCReader get(String arcFileOrUrl, final long offset)
073: throws MalformedURLException, IOException {
074: return (ARCReader) ARCReaderFactory.factory.getArchiveReader(
075: arcFileOrUrl, offset);
076: }
077:
078: public static ARCReader get(final File f) throws IOException {
079: return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f);
080: }
081:
082: public static ARCReader get(final File f, final long offset)
083: throws IOException {
084: return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f,
085: offset);
086: }
087:
088: protected ArchiveReader getArchiveReader(final File f,
089: final long offset) throws IOException {
090: return getArchiveReader(f, true, offset);
091: }
092:
093: /**
094: * @param f An arcfile to read.
095: * @param skipSuffixTest Set to true if want to test that ARC has proper
096: * suffix. Use this method and pass <code>false</code> to open ARCs
097: * with the <code>.open</code> or otherwise suffix.
098: * @param offset Have returned ARCReader set to start reading at passed
099: * offset.
100: * @return An ARCReader.
101: * @throws IOException
102: */
103: public static ARCReader get(final File f,
104: final boolean skipSuffixTest, final long offset)
105: throws IOException {
106: return (ARCReader) ARCReaderFactory.factory.getArchiveReader(f,
107: skipSuffixTest, offset);
108: }
109:
110: protected ArchiveReader getArchiveReader(final File arcFile,
111: final boolean skipSuffixTest, final long offset)
112: throws IOException {
113: boolean compressed = testCompressedARCFile(arcFile,
114: skipSuffixTest);
115: if (!compressed) {
116: if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
117: ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
118: throw new IOException(arcFile.getAbsolutePath()
119: + " is not an Internet Archive ARC file.");
120: }
121: }
122: return compressed ? (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(
123: arcFile, offset)
124: : (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(
125: arcFile, offset);
126: }
127:
128: public static ArchiveReader get(final String s,
129: final InputStream is, final boolean atFirstRecord)
130: throws IOException {
131: return ARCReaderFactory.factory.getArchiveReader(s, is,
132: atFirstRecord);
133: }
134:
135: protected ArchiveReader getArchiveReader(final String arc,
136: final InputStream is, final boolean atFirstRecord)
137: throws IOException {
138: // For now, assume stream is compressed. Later add test of input
139: // stream or handle exception thrown when figure not compressed stream.
140: return new CompressedARCReader(arc, asRepositionable(is),
141: atFirstRecord);
142: }
143:
144: /**
145: * Get an ARCReader aligned at <code>offset</code>. This version of get
146: * will not bring the ARC local but will try to stream across the net making
147: * an HTTP 1.1 Range request on remote http server (RFC1435 Section 14.35).
148: *
149: * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
150: * @param offset Offset into ARC at which to start fetching.
151: * @return An ARCReader aligned at offset.
152: * @throws IOException
153: */
154: public static ARCReader get(final URL arcUrl, final long offset)
155: throws IOException {
156: return (ARCReader) ARCReaderFactory.factory.getArchiveReader(
157: arcUrl, offset);
158: }
159:
160: /**
161: * Get an ARCReader.
162: * Pulls the ARC local into whereever the System Property
163: * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
164: * points at this local copy. A close on this ARCReader instance will
165: * remove the local copy.
166: * @param arcUrl An URL that points at an ARC.
167: * @return An ARCReader.
168: * @throws IOException
169: */
170: public static ARCReader get(final URL arcUrl) throws IOException {
171: return (ARCReader) ARCReaderFactory.factory
172: .getArchiveReader(arcUrl);
173: }
174:
175: /**
176: * @param arcFile File to test.
177: * @return True if <code>arcFile</code> is compressed ARC.
178: * @throws IOException
179: */
180: public boolean isCompressed(File arcFile) throws IOException {
181: return testCompressedARCFile(arcFile);
182: }
183:
184: /**
185: * Check file is compressed and in ARC GZIP format.
186: *
187: * @param arcFile File to test if its Internet Archive ARC file
188: * GZIP compressed.
189: *
190: * @return True if this is an Internet Archive GZIP'd ARC file (It begins
191: * w/ the Internet Archive GZIP header and has the
192: * COMPRESSED_ARC_FILE_EXTENSION suffix).
193: *
194: * @exception IOException If file does not exist or is not unreadable.
195: */
196: public static boolean testCompressedARCFile(File arcFile)
197: throws IOException {
198: return testCompressedARCFile(arcFile, false);
199: }
200:
201: /**
202: * Check file is compressed and in ARC GZIP format.
203: *
204: * @param arcFile File to test if its Internet Archive ARC file
205: * GZIP compressed.
206: * @param skipSuffixCheck Set to true if we're not to test on the
207: * '.arc.gz' suffix.
208: *
209: * @return True if this is an Internet Archive GZIP'd ARC file (It begins
210: * w/ the Internet Archive GZIP header).
211: *
212: * @exception IOException If file does not exist or is not unreadable.
213: */
214: public static boolean testCompressedARCFile(File arcFile,
215: boolean skipSuffixCheck) throws IOException {
216: boolean compressedARCFile = false;
217: FileUtils.isReadable(arcFile);
218: if (!skipSuffixCheck
219: && !arcFile.getName().toLowerCase().endsWith(
220: COMPRESSED_ARC_FILE_EXTENSION)) {
221: return compressedARCFile;
222: }
223:
224: final InputStream is = new FileInputStream(arcFile);
225: try {
226: compressedARCFile = testCompressedARCStream(is);
227: } finally {
228: is.close();
229: }
230: return compressedARCFile;
231: }
232:
233: public static boolean isARCSuffix(final String arcName) {
234: return (arcName == null) ? false : (arcName.toLowerCase()
235: .endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION)) ? true
236: : (arcName.toLowerCase()
237: .endsWith(DOT_ARC_FILE_EXTENSION)) ? true
238: : false;
239: }
240:
241: /**
242: * Tests passed stream is gzip stream by reading in the HEAD.
243: * Does not reposition the stream. That is left up to the caller.
244: * @param is An InputStream.
245: * @return True if compressed stream.
246: * @throws IOException
247: */
248: public static boolean testCompressedARCStream(final InputStream is)
249: throws IOException {
250: boolean compressedARCFile = false;
251: GzipHeader gh = null;
252: try {
253: gh = new GzipHeader(is);
254: } catch (NoGzipMagicException e) {
255: return compressedARCFile;
256: }
257:
258: byte[] fextra = gh.getFextra();
259: // Now make sure following bytes are IA GZIP comment.
260: // First check length. ARC_GZIP_EXTRA_FIELD includes length
261: // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
262: // at +2.
263: if (fextra != null
264: && ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
265: compressedARCFile = true;
266: for (int i = 0; i < fextra.length; i++) {
267: if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
268: compressedARCFile = false;
269: break;
270: }
271: }
272: }
273: return compressedARCFile;
274: }
275:
276: /**
277: * Uncompressed arc file reader.
278: * @author stack
279: */
280: private class UncompressedARCReader extends ARCReader {
281: /**
282: * Constructor.
283: * @param f Uncompressed arcfile to read.
284: * @throws IOException
285: */
286: public UncompressedARCReader(final File f) throws IOException {
287: this (f, 0);
288: }
289:
290: /**
291: * Constructor.
292: *
293: * @param f Uncompressed arcfile to read.
294: * @param offset Offset at which to position ARCReader.
295: * @throws IOException
296: */
297: public UncompressedARCReader(final File f, final long offset)
298: throws IOException {
299: // Arc file has been tested for existence by time it has come
300: // to here.
301: setIn(getInputStream(f, offset));
302: initialize(f.getAbsolutePath());
303: }
304:
305: /**
306: * Constructor.
307: *
308: * @param f Uncompressed arc to read.
309: * @param is InputStream.
310: */
311: public UncompressedARCReader(final String f,
312: final InputStream is) {
313: // Arc file has been tested for existence by time it has come
314: // to here.
315: setIn(is);
316: initialize(f);
317: }
318: }
319:
320: /**
321: * Compressed arc file reader.
322: *
323: * @author stack
324: */
325: private class CompressedARCReader extends ARCReader {
326:
327: /**
328: * Constructor.
329: *
330: * @param f
331: * Compressed arcfile to read.
332: * @throws IOException
333: */
334: public CompressedARCReader(final File f) throws IOException {
335: this (f, 0);
336: }
337:
338: /**
339: * Constructor.
340: *
341: * @param f Compressed arcfile to read.
342: * @param offset Position at where to start reading file.
343: * @throws IOException
344: */
345: public CompressedARCReader(final File f, final long offset)
346: throws IOException {
347: // Arc file has been tested for existence by time it has come
348: // to here.
349: setIn(new GzippedInputStream(getInputStream(f, offset)));
350: setCompressed((offset == 0));
351: initialize(f.getAbsolutePath());
352: }
353:
354: /**
355: * Constructor.
356: *
357: * @param f Compressed arcfile.
358: * @param is InputStream to use.
359: * @throws IOException
360: */
361: public CompressedARCReader(final String f,
362: final InputStream is, final boolean atFirstRecord)
363: throws IOException {
364: // Arc file has been tested for existence by time it has come
365: // to here.
366: setIn(new GzippedInputStream(is));
367: setCompressed(true);
368: setAlignedOnFirstRecord(atFirstRecord);
369: initialize(f);
370: }
371:
372: /**
373: * Get record at passed <code>offset</code>.
374: *
375: * @param offset
376: * Byte index into arcfile at which a record starts.
377: * @return An ARCRecord reference.
378: * @throws IOException
379: */
380: public ARCRecord get(long offset) throws IOException {
381: cleanupCurrentRecord();
382: ((GzippedInputStream) getIn()).gzipMemberSeek(offset);
383: return createArchiveRecord(getIn(), offset);
384: }
385:
386: public Iterator<ArchiveRecord> iterator() {
387: /**
388: * Override ARCRecordIterator so can base returned iterator on
389: * GzippedInputStream iterator.
390: */
391: return new ArchiveRecordIterator() {
392: private GzippedInputStream gis = (GzippedInputStream) getInputStream();
393:
394: private Iterator gzipIterator = this .gis.iterator();
395:
396: protected boolean innerHasNext() {
397: return this .gzipIterator.hasNext();
398: }
399:
400: protected ArchiveRecord innerNext() throws IOException {
401: // Get the position before gzipIterator.next moves
402: // it on past the gzip header.
403: long p = this .gis.position();
404: InputStream is = (InputStream) this .gzipIterator
405: .next();
406: return createArchiveRecord(is, p);
407: }
408: };
409: }
410:
411: protected void gotoEOR(ArchiveRecord rec) throws IOException {
412: long skipped = ((GzippedInputStream) getIn())
413: .gotoEOR(LINE_SEPARATOR);
414: if (skipped <= 0) {
415: return;
416: }
417: // Report on system error the number of unexpected characters
418: // at the end of this record.
419: ArchiveRecordHeader meta = (getCurrentRecord() != null) ? rec
420: .getHeader()
421: : null;
422: String message = "Record ENDING at "
423: + ((GzippedInputStream) getIn()).position()
424: + " has " + skipped + " trailing byte(s): "
425: + ((meta != null) ? meta.toString() : "");
426: if (isStrict()) {
427: throw new IOException(message);
428: }
429: logStdErr(Level.WARNING, message);
430: }
431: }
432: }
|