001: /* $Id: ArchiveReaderFactory.java 4977 2007-03-09 23:57:28Z stack-sf $
002: *
003: * Created on August 18th, 2006
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import it.unimi.dsi.fastutil.io.RepositionableStream;
026:
027: import java.io.File;
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.net.HttpURLConnection;
031: import java.net.MalformedURLException;
032: import java.net.URL;
033: import java.net.URLConnection;
034:
035: import org.archive.io.arc.ARCReaderFactory;
036: import org.archive.io.warc.WARCReaderFactory;
037: import org.archive.net.UURI;
038: import org.archive.net.md5.Md5URLConnection;
039: import org.archive.net.rsync.RsyncURLConnection;
040: import org.archive.util.FileUtils;
041: import org.archive.util.IoUtils;
042:
043: /**
044: * Factory that returns an Archive file Reader.
045: * Returns Readers for ARCs or WARCs.
046: * @author stack
047: * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
048: */
049: public class ArchiveReaderFactory implements ArchiveFileConstants {
050: /**
051: * Offset value for when we want to stream all.
052: */
053: private final static int STREAM_ALL = -1;
054:
055: private static final ArchiveReaderFactory factory = new ArchiveReaderFactory();
056:
057: /**
058: * Shutdown any public access to default constructor.
059: */
060: protected ArchiveReaderFactory() {
061: super ();
062: }
063:
064: /**
065: * Get an Archive file Reader on passed path or url.
066: * Does primitive heuristic figuring if path or URL.
067: * @param arcFileOrUrl File path or URL pointing at an Archive file.
068: * @return An Archive file Reader.
069: * @throws IOException
070: * @throws MalformedURLException
071: * @throws IOException
072: */
073: public static ArchiveReader get(final String arcFileOrUrl)
074: throws MalformedURLException, IOException {
075: return ArchiveReaderFactory.factory
076: .getArchiveReader(arcFileOrUrl);
077: }
078:
079: protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
080: throws MalformedURLException, IOException {
081: return getArchiveReader(arcFileOrUrl, STREAM_ALL);
082: }
083:
084: protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
085: final long offset) throws MalformedURLException,
086: IOException {
087: return UURI.hasScheme(arcFileOrUrl) ? get(
088: new URL(arcFileOrUrl), offset) : get(new File(
089: arcFileOrUrl), offset);
090: }
091:
092: /**
093: * @param f An Archive file to read.
094: * @return An ArchiveReader
095: * @throws IOException
096: */
097: public static ArchiveReader get(final File f) throws IOException {
098: return ArchiveReaderFactory.factory.getArchiveReader(f);
099: }
100:
101: protected ArchiveReader getArchiveReader(final File f)
102: throws IOException {
103: return getArchiveReader(f, 0);
104: }
105:
106: /**
107: * @param f An Archive file to read.
108: * @param offset Have returned Reader set to start reading at this offset.
109: * @return An ArchiveReader
110: * @throws IOException
111: */
112: public static ArchiveReader get(final File f, final long offset)
113: throws IOException {
114: return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
115: }
116:
117: protected ArchiveReader getArchiveReader(final File f,
118: final long offset) throws IOException {
119: if (ARCReaderFactory.isARCSuffix(f.getName())) {
120: return ARCReaderFactory.get(f, true, offset);
121: } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
122: return WARCReaderFactory.get(f, offset);
123: }
124: throw new IOException(
125: "Unknown file extension (Not ARC nor WARC): "
126: + f.getName());
127: }
128:
129: /**
130: * Wrap a Reader around passed Stream.
131: * @param s Identifying String for this Stream used in error messages.
132: * Must be a string that ends with the name of the file we're to put
133: * an ArchiveReader on. This code looks at file endings to figure
134: * whether to return an ARC or WARC reader.
135: * @param is Stream. Stream will be wrapped with implementation of
136: * RepositionableStream unless already supported.
137: * @param atFirstRecord Are we at first Record?
138: * @return ArchiveReader.
139: * @throws IOException
140: */
141: public static ArchiveReader get(final String s,
142: final InputStream is, final boolean atFirstRecord)
143: throws IOException {
144: return ArchiveReaderFactory.factory.getArchiveReader(s, is,
145: atFirstRecord);
146: }
147:
148: /**
149: * @param is
150: * @return If passed <code>is</code> is
151: * {@link RepositionableInputStream}, returns <code>is</code>, else we
152: * wrap <code>is</code> with {@link RepositionableStream}.
153: */
154: protected InputStream asRepositionable(final InputStream is) {
155: if (is instanceof RepositionableStream) {
156: return is;
157: }
158: // RepositionableInputStream calls mark on each read so can back up at
159: // least the read amount. Needed for gzip inflater overinflations
160: // reading into the next gzip member.
161: return new RepositionableInputStream(is, 16 * 1024);
162: }
163:
164: protected ArchiveReader getArchiveReader(final String id,
165: final InputStream is, final boolean atFirstRecord)
166: throws IOException {
167: final InputStream stream = asRepositionable(is);
168: if (ARCReaderFactory.isARCSuffix(id)) {
169: return ARCReaderFactory.get(id, stream, atFirstRecord);
170: } else if (WARCReaderFactory.isWARCSuffix(id)) {
171: return WARCReaderFactory.get(id, stream, atFirstRecord);
172: }
173: throw new IOException("Unknown extension (Not ARC nor WARC): "
174: + id);
175: }
176:
177: /**
178: * Get an Archive Reader aligned at <code>offset</code>.
179: * This version of get will not bring the file local but will try to
180: * stream across the net making an HTTP 1.1 Range request on remote
181: * http server (RFC1435 Section 14.35).
182: * @param u HTTP URL for an Archive file.
183: * @param offset Offset into file at which to start fetching.
184: * @return An ArchiveReader aligned at offset.
185: * @throws IOException
186: */
187: public static ArchiveReader get(final URL u, final long offset)
188: throws IOException {
189: return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
190: }
191:
192: protected ArchiveReader getArchiveReader(final URL f,
193: final long offset) throws IOException {
194: // Get URL connection.
195: URLConnection connection = f.openConnection();
196: if (!(connection instanceof HttpURLConnection)) {
197: throw new IOException(
198: "This method only handles HTTP connections.");
199: }
200: addUserAgent((HttpURLConnection) connection);
201: if (offset != STREAM_ALL) {
202: // Use a Range request (Assumes HTTP 1.1 on other end). If
203: // length >= 0, add open-ended range header to the request. Else,
204: // because end-byte is inclusive, subtract 1.
205: connection.addRequestProperty("Range", "bytes=" + offset
206: + "-");
207: }
208:
209: return getArchiveReader(f.toString(), connection
210: .getInputStream(), (offset == 0));
211: }
212:
213: /**
214: * Get an ARCReader.
215: * Pulls the ARC local into whereever the System Property
216: * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
217: * points at this local copy. A close on this ARCReader instance will
218: * remove the local copy.
219: * @param u An URL that points at an ARC.
220: * @return An ARCReader.
221: * @throws IOException
222: */
223: public static ArchiveReader get(final URL u) throws IOException {
224: return ArchiveReaderFactory.factory.getArchiveReader(u);
225: }
226:
227: protected ArchiveReader getArchiveReader(final URL u)
228: throws IOException {
229: // If url represents a local file then return file it points to.
230: if (u.getPath() != null) {
231: // TODO: Add scheme check and host check.
232: File f = new File(u.getPath());
233: if (f.exists()) {
234: return get(f, 0);
235: }
236: }
237:
238: String scheme = u.getProtocol();
239: if (scheme.startsWith("http") || scheme.equals("s3")) {
240: // Try streaming if http or s3 URLs rather than copying local
241: // and then reading (Passing an offset will get us an Reader
242: // that wraps a Stream).
243: return get(u, STREAM_ALL);
244: }
245:
246: return makeARCLocal(u.openConnection());
247: }
248:
249: protected ArchiveReader makeARCLocal(final URLConnection connection)
250: throws IOException {
251: File localFile = null;
252: if (connection instanceof HttpURLConnection) {
253: // If http url connection, bring down the resource local.
254: String p = connection.getURL().getPath();
255: int index = p.lastIndexOf('/');
256: if (index >= 0) {
257: // Name file for the file we're making local.
258: localFile = new File(FileUtils.TMPDIR, p
259: .substring(index + 1));
260: if (localFile.exists()) {
261: // If file of same name already exists in TMPDIR, then
262: // clean it up (Assuming only reason a file of same name in
263: // TMPDIR is because we failed a previous download).
264: localFile.delete();
265: }
266: } else {
267: localFile = File.createTempFile(ArchiveReader.class
268: .getName(), ".tmp", FileUtils.TMPDIR);
269: }
270: addUserAgent((HttpURLConnection) connection);
271: connection.connect();
272: try {
273: IoUtils.readFullyToFile(connection.getInputStream(),
274: localFile, new byte[16 * 1024]);
275: } catch (IOException ioe) {
276: localFile.delete();
277: throw ioe;
278: }
279: } else if (connection instanceof RsyncURLConnection) {
280: // Then, connect and this will create a local file.
281: // See implementation of the rsync handler.
282: connection.connect();
283: localFile = ((RsyncURLConnection) connection).getFile();
284: } else if (connection instanceof Md5URLConnection) {
285: // Then, connect and this will create a local file.
286: // See implementation of the md5 handler.
287: connection.connect();
288: localFile = ((Md5URLConnection) connection).getFile();
289: } else {
290: throw new UnsupportedOperationException("No support for "
291: + connection);
292: }
293:
294: ArchiveReader reader = null;
295: try {
296: reader = get(localFile, 0);
297: } catch (IOException e) {
298: localFile.delete();
299: throw e;
300: }
301:
302: // Return a delegate that does cleanup of downloaded file on close.
303: return reader.getDeleteFileOnCloseReader(localFile);
304: }
305:
306: protected void addUserAgent(final HttpURLConnection connection) {
307: connection.addRequestProperty("User-Agent", this .getClass()
308: .getName());
309: }
310:
311: /**
312: * @param f File to test.
313: * @return True if <code>f</code> is compressed.
314: * @throws IOException
315: */
316: protected boolean isCompressed(final File f) throws IOException {
317: return f.getName().toLowerCase().endsWith(
318: DOT_COMPRESSED_FILE_EXTENSION);
319: }
320: }
|