001: /* $Id: WARCReaderFactory.java 4533 2006-08-24 00:59:04Z stack-sf $
002: *
003: * Created Aug 22, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.warc.v10;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.IOException;
028: import java.io.InputStream;
029: import java.net.MalformedURLException;
030: import java.net.URL;
031: import java.util.Iterator;
032:
033: import org.archive.io.ArchiveReader;
034: import org.archive.io.ArchiveReaderFactory;
035: import org.archive.io.ArchiveRecord;
036: import org.archive.io.GzippedInputStream;
037: import org.archive.io.warc.WARCConstants;
038: import org.archive.util.FileUtils;
039: import org.archive.net.UURI;
040:
041: /**
042: * Factory for WARC Readers.
043: * Figures whether to give out a compressed file Reader or an uncompressed
044: * Reader.
045: * @author stack
046: * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
047: */
048: public class WARCReaderFactory extends ArchiveReaderFactory implements
049: WARCConstants {
050: private static final WARCReaderFactory factory = new WARCReaderFactory();
051:
052: /**
053: * Shutdown any access to default constructor.
054: * This factory is Singleton.
055: */
056: private WARCReaderFactory() {
057: super ();
058: }
059:
060: public static WARCReader get(String arcFileOrUrl)
061: throws MalformedURLException, IOException {
062: return (WARCReader) WARCReaderFactory.factory
063: .getArchiveReader(arcFileOrUrl);
064: }
065:
066: public static WARCReader get(final File f) throws IOException {
067: return (WARCReader) WARCReaderFactory.factory
068: .getArchiveReader(f);
069: }
070:
071: /**
072: * @param f An arcfile to read.
073: * @param offset Have returned Reader set to start reading at this offset.
074: * @return A WARCReader.
075: * @throws IOException
076: */
077: public static WARCReader get(final File f, final long offset)
078: throws IOException {
079: return (WARCReader) WARCReaderFactory.factory.getArchiveReader(
080: f, offset);
081: }
082:
083: protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
084: final long offset) throws MalformedURLException,
085: IOException {
086: return UURI.hasScheme(arcFileOrUrl) ? get(
087: new URL(arcFileOrUrl), offset) : get(new File(
088: arcFileOrUrl), offset);
089: }
090:
091: protected ArchiveReader getArchiveReader(final File f,
092: final long offset) throws IOException {
093: boolean compressed = testCompressedWARCFile(f);
094: if (!compressed) {
095: if (!FileUtils.isReadableWithExtensionAndMagic(f,
096: DOT_WARC_FILE_EXTENSION, WARC_010_MAGIC)) {
097: throw new IOException(f.getAbsolutePath()
098: + " is not a WARC file.");
099: }
100: }
101: return (WARCReader) (compressed ? WARCReaderFactory.factory.new CompressedWARCReader(
102: f, offset)
103: : WARCReaderFactory.factory.new UncompressedWARCReader(
104: f, offset));
105: }
106:
107: public static ArchiveReader get(final String s,
108: final InputStream is, final boolean atFirstRecord)
109: throws IOException {
110: return WARCReaderFactory.factory.getArchiveReader(s, is,
111: atFirstRecord);
112: }
113:
114: protected ArchiveReader getArchiveReader(final String f,
115: final InputStream is, final boolean atFirstRecord)
116: throws IOException {
117: // For now, assume stream is compressed. Later add test of input
118: // stream or handle exception thrown when figure not compressed stream.
119: return new CompressedWARCReader(f, is, atFirstRecord);
120: }
121:
122: public static WARCReader get(final URL arcUrl, final long offset)
123: throws IOException {
124: return (WARCReader) WARCReaderFactory.factory.getArchiveReader(
125: arcUrl, offset);
126: }
127:
128: /**
129: * Get an ARCReader.
130: * Pulls the ARC local into whereever the System Property
131: * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
132: * points at this local copy. A close on this ARCReader instance will
133: * remove the local copy.
134: * @param arcUrl An URL that points at an ARC.
135: * @return An ARCReader.
136: * @throws IOException
137: */
138: public static WARCReader get(final URL arcUrl) throws IOException {
139: return (WARCReader) WARCReaderFactory.factory
140: .getArchiveReader(arcUrl);
141: }
142:
143: /**
144: * Check file is compressed WARC.
145: *
146: * @param f File to test.
147: *
148: * @return True if this is compressed WARC (TODO: Just tests if file is
149: * GZIP'd file (It begins w/ GZIP MAGIC)).
150: *
151: * @exception IOException If file does not exist or is not unreadable.
152: */
153: public static boolean testCompressedWARCFile(final File f)
154: throws IOException {
155: FileUtils.isReadable(f);
156: boolean compressed = false;
157: final InputStream is = new FileInputStream(f);
158: try {
159: compressed = GzippedInputStream.isCompressedStream(is);
160: } finally {
161: is.close();
162: }
163: return compressed;
164: }
165:
166: /**
167: * Uncompressed WARC file reader.
168: * @author stack
169: */
170: private class UncompressedWARCReader extends WARCReader {
171: /**
172: * Constructor.
173: * @param f Uncompressed arcfile to read.
174: * @throws IOException
175: */
176: public UncompressedWARCReader(final File f) throws IOException {
177: this (f, 0);
178: }
179:
180: /**
181: * Constructor.
182: *
183: * @param f Uncompressed file to read.
184: * @param offset Offset at which to position Reader.
185: * @throws IOException
186: */
187: public UncompressedWARCReader(final File f, final long offset)
188: throws IOException {
189: // File has been tested for existence by time it has come to here.
190: setIn(getInputStream(f, offset));
191: initialize(f.getAbsolutePath());
192: }
193:
194: /**
195: * Constructor.
196: *
197: * @param f Uncompressed file to read.
198: * @param is InputStream.
199: */
200: public UncompressedWARCReader(final String f,
201: final InputStream is) {
202: // Arc file has been tested for existence by time it has come
203: // to here.
204: setIn(is);
205: initialize(f);
206: }
207: }
208:
209: /**
210: * Compressed WARC file reader.
211: *
212: * @author stack
213: */
214: private class CompressedWARCReader extends WARCReader {
215: /**
216: * Constructor.
217: *
218: * @param f Compressed file to read.
219: * @throws IOException
220: */
221: public CompressedWARCReader(final File f) throws IOException {
222: this (f, 0);
223: }
224:
225: /**
226: * Constructor.
227: *
228: * @param f Compressed arcfile to read.
229: * @param offset Position at where to start reading file.
230: * @throws IOException
231: */
232: public CompressedWARCReader(final File f, final long offset)
233: throws IOException {
234: // File has been tested for existence by time it has come to here.
235: setIn(new GzippedInputStream(getInputStream(f, offset)));
236: setCompressed((offset == 0));
237: initialize(f.getAbsolutePath());
238: }
239:
240: /**
241: * Constructor.
242: *
243: * @param f Compressed arcfile.
244: * @param is InputStream to use.
245: * @param atFirstRecord
246: * @throws IOException
247: */
248: public CompressedWARCReader(final String f,
249: final InputStream is, final boolean atFirstRecord)
250: throws IOException {
251: // Arc file has been tested for existence by time it has come
252: // to here.
253: setIn(new GzippedInputStream(is));
254: setCompressed(true);
255: initialize(f);
256: // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
257: }
258:
259: /**
260: * Get record at passed <code>offset</code>.
261: *
262: * @param offset Byte index into file at which a record starts.
263: * @return A WARCRecord reference.
264: * @throws IOException
265: */
266: public WARCRecord get(long offset) throws IOException {
267: cleanupCurrentRecord();
268: ((GzippedInputStream) getIn()).gzipMemberSeek(offset);
269: return (WARCRecord) createArchiveRecord(getIn(), offset);
270: }
271:
272: public Iterator<ArchiveRecord> iterator() {
273: /**
274: * Override ArchiveRecordIterator so can base returned iterator on
275: * GzippedInputStream iterator.
276: */
277: return new ArchiveRecordIterator() {
278: private GzippedInputStream gis = (GzippedInputStream) getInputStream();
279:
280: private Iterator gzipIterator = this .gis.iterator();
281:
282: protected boolean innerHasNext() {
283: return this .gzipIterator.hasNext();
284: }
285:
286: protected ArchiveRecord innerNext() throws IOException {
287: // Get the positoin before gzipIterator.next moves
288: // it on past the gzip header.
289: long p = this .gis.position();
290: InputStream is = (InputStream) this .gzipIterator
291: .next();
292: return createArchiveRecord(is, p);
293: }
294: };
295: }
296:
297: protected void gotoEOR(ArchiveRecord rec) throws IOException {
298: // TODO
299: }
300: }
301:
302: public static boolean isWARCSuffix(final String f) {
303: return (f == null) ? false
304: : (f.toLowerCase()
305: .endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION)) ? true
306: : (f.toLowerCase()
307: .endsWith(DOT_WARC_FILE_EXTENSION)) ? true
308: : false;
309: }
310: }
|