001: /* $Id: WARCReaderFactory.java 4533 2006-08-24 00:59:04Z stack-sf $
002: *
003: * Created Aug 22, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.IOException;
028: import java.io.InputStream;
029: import java.net.MalformedURLException;
030: import java.net.URL;
031: import java.util.Iterator;
032:
033: import org.archive.io.ArchiveReader;
034: import org.archive.io.ArchiveReaderFactory;
035: import org.archive.io.ArchiveRecord;
036: import org.archive.io.GzippedInputStream;
037: import org.archive.io.warc.WARCConstants;
038: import org.archive.util.FileUtils;
039:
040: /**
041: * Factory for WARC Readers.
042: * Figures whether to give out a compressed file Reader or an uncompressed
043: * Reader.
044: * @author stack
045: * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
046: */
047: public class WARCReaderFactory extends ArchiveReaderFactory implements
048: WARCConstants {
049: private static final WARCReaderFactory factory = new WARCReaderFactory();
050:
051: /**
052: * Shutdown any access to default constructor.
053: * This factory is Singleton.
054: */
055: private WARCReaderFactory() {
056: super ();
057: }
058:
059: public static WARCReader get(String arcFileOrUrl)
060: throws MalformedURLException, IOException {
061: return (WARCReader) WARCReaderFactory.factory
062: .getArchiveReader(arcFileOrUrl);
063: }
064:
065: public static WARCReader get(final File f) throws IOException {
066: return (WARCReader) WARCReaderFactory.factory
067: .getArchiveReader(f);
068: }
069:
070: /**
071: * @param f An arcfile to read.
072: * @param offset Have returned Reader set to start reading at this offset.
073: * @return A WARCReader.
074: * @throws IOException
075: */
076: public static WARCReader get(final File f, final long offset)
077: throws IOException {
078: return (WARCReader) WARCReaderFactory.factory.getArchiveReader(
079: f, offset);
080: }
081:
082: protected ArchiveReader getArchiveReader(final File f,
083: final long offset) throws IOException {
084: boolean compressed = testCompressedWARCFile(f);
085: if (!compressed) {
086: if (!FileUtils.isReadableWithExtensionAndMagic(f,
087: DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
088: throw new IOException(f.getAbsolutePath()
089: + " is not a WARC file.");
090: }
091: }
092: return (WARCReader) (compressed ? WARCReaderFactory.factory.new CompressedWARCReader(
093: f, offset)
094: : WARCReaderFactory.factory.new UncompressedWARCReader(
095: f, offset));
096: }
097:
098: public static ArchiveReader get(final String s,
099: final InputStream is, final boolean atFirstRecord)
100: throws IOException {
101: return WARCReaderFactory.factory.getArchiveReader(s, is,
102: atFirstRecord);
103: }
104:
105: protected ArchiveReader getArchiveReader(final String f,
106: final InputStream is, final boolean atFirstRecord)
107: throws IOException {
108: // For now, assume stream is compressed. Later add test of input
109: // stream or handle exception thrown when figure not compressed stream.
110: return new CompressedWARCReader(f, is, atFirstRecord);
111: }
112:
113: public static WARCReader get(final URL arcUrl, final long offset)
114: throws IOException {
115: return (WARCReader) WARCReaderFactory.factory.getArchiveReader(
116: arcUrl, offset);
117: }
118:
119: /**
120: * Get an ARCReader.
121: * Pulls the ARC local into whereever the System Property
122: * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
123: * points at this local copy. A close on this ARCReader instance will
124: * remove the local copy.
125: * @param arcUrl An URL that points at an ARC.
126: * @return An ARCReader.
127: * @throws IOException
128: */
129: public static WARCReader get(final URL arcUrl) throws IOException {
130: return (WARCReader) WARCReaderFactory.factory
131: .getArchiveReader(arcUrl);
132: }
133:
134: /**
135: * Check file is compressed WARC.
136: *
137: * @param f File to test.
138: *
139: * @return True if this is compressed WARC (TODO: Just tests if file is
140: * GZIP'd file (It begins w/ GZIP MAGIC)).
141: *
142: * @exception IOException If file does not exist or is not unreadable.
143: */
144: public static boolean testCompressedWARCFile(final File f)
145: throws IOException {
146: FileUtils.isReadable(f);
147: boolean compressed = false;
148: final InputStream is = new FileInputStream(f);
149: try {
150: compressed = GzippedInputStream.isCompressedStream(is);
151: } finally {
152: is.close();
153: }
154: return compressed;
155: }
156:
157: /**
158: * Uncompressed WARC file reader.
159: * @author stack
160: */
161: private class UncompressedWARCReader extends WARCReader {
162: /**
163: * Constructor.
164: * @param f Uncompressed arcfile to read.
165: * @throws IOException
166: */
167: public UncompressedWARCReader(final File f) throws IOException {
168: this (f, 0);
169: }
170:
171: /**
172: * Constructor.
173: *
174: * @param f Uncompressed file to read.
175: * @param offset Offset at which to position Reader.
176: * @throws IOException
177: */
178: public UncompressedWARCReader(final File f, final long offset)
179: throws IOException {
180: // File has been tested for existence by time it has come to here.
181: setIn(getInputStream(f, offset));
182: initialize(f.getAbsolutePath());
183: }
184:
185: /**
186: * Constructor.
187: *
188: * @param f Uncompressed file to read.
189: * @param is InputStream.
190: */
191: public UncompressedWARCReader(final String f,
192: final InputStream is) {
193: // Arc file has been tested for existence by time it has come
194: // to here.
195: setIn(is);
196: initialize(f);
197: }
198: }
199:
200: /**
201: * Compressed WARC file reader.
202: *
203: * @author stack
204: */
205: private class CompressedWARCReader extends WARCReader {
206: /**
207: * Constructor.
208: *
209: * @param f Compressed file to read.
210: * @throws IOException
211: */
212: public CompressedWARCReader(final File f) throws IOException {
213: this (f, 0);
214: }
215:
216: /**
217: * Constructor.
218: *
219: * @param f Compressed arcfile to read.
220: * @param offset Position at where to start reading file.
221: * @throws IOException
222: */
223: public CompressedWARCReader(final File f, final long offset)
224: throws IOException {
225: // File has been tested for existence by time it has come to here.
226: setIn(new GzippedInputStream(getInputStream(f, offset)));
227: setCompressed((offset == 0));
228: initialize(f.getAbsolutePath());
229: }
230:
231: /**
232: * Constructor.
233: *
234: * @param f Compressed arcfile.
235: * @param is InputStream to use.
236: * @param atFirstRecord
237: * @throws IOException
238: */
239: public CompressedWARCReader(final String f,
240: final InputStream is, final boolean atFirstRecord)
241: throws IOException {
242: // Arc file has been tested for existence by time it has come
243: // to here.
244: setIn(new GzippedInputStream(is));
245: setCompressed(true);
246: initialize(f);
247: // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
248: }
249:
250: /**
251: * Get record at passed <code>offset</code>.
252: *
253: * @param offset Byte index into file at which a record starts.
254: * @return A WARCRecord reference.
255: * @throws IOException
256: */
257: public WARCRecord get(long offset) throws IOException {
258: cleanupCurrentRecord();
259: ((GzippedInputStream) getIn()).gzipMemberSeek(offset);
260: return (WARCRecord) createArchiveRecord(getIn(), offset);
261: }
262:
263: public Iterator<ArchiveRecord> iterator() {
264: /**
265: * Override ArchiveRecordIterator so can base returned iterator on
266: * GzippedInputStream iterator.
267: */
268: return new ArchiveRecordIterator() {
269: private GzippedInputStream gis = (GzippedInputStream) getInputStream();
270:
271: private Iterator gzipIterator = this .gis.iterator();
272:
273: protected boolean innerHasNext() {
274: return this .gzipIterator.hasNext();
275: }
276:
277: protected ArchiveRecord innerNext() throws IOException {
278: // Get the positoin before gzipIterator.next moves
279: // it on past the gzip header.
280: long p = this .gis.position();
281: InputStream is = (InputStream) this .gzipIterator
282: .next();
283: return createArchiveRecord(is, p);
284: }
285: };
286: }
287:
288: protected void gotoEOR(ArchiveRecord rec) throws IOException {
289: // TODO
290: }
291: }
292:
293: public static boolean isWARCSuffix(final String f) {
294: return (f == null) ? false
295: : (f.toLowerCase()
296: .endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION)) ? true
297: : (f.toLowerCase()
298: .endsWith(DOT_WARC_FILE_EXTENSION)) ? true
299: : false;
300: }
301: }
|