Source Code Cross Referenced for ArchiveReaderFactory.java in » Web-Crawler » heritrix » org » archive » io » Java Source Code / Java Documentation — Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* $Id: ArchiveReaderFactory.java 4977 2007-03-09 23:57:28Z stack-sf $
002:         *
003:         * Created on August 18th, 2006
004:         *
005:         * Copyright (C) 2004 Internet Archive.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.io;
024:
025:        import it.unimi.dsi.fastutil.io.RepositionableStream;
026:
027:        import java.io.File;
028:        import java.io.IOException;
029:        import java.io.InputStream;
030:        import java.net.HttpURLConnection;
031:        import java.net.MalformedURLException;
032:        import java.net.URL;
033:        import java.net.URLConnection;
034:
035:        import org.archive.io.arc.ARCReaderFactory;
036:        import org.archive.io.warc.WARCReaderFactory;
037:        import org.archive.net.UURI;
038:        import org.archive.net.md5.Md5URLConnection;
039:        import org.archive.net.rsync.RsyncURLConnection;
040:        import org.archive.util.FileUtils;
041:        import org.archive.util.IoUtils;
042:
043:        /**
044:         * Factory that returns an Archive file Reader.
045:         * Returns Readers for ARCs or WARCs.
046:         * @author stack
047:         * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
048:         */
049:        public class ArchiveReaderFactory implements  ArchiveFileConstants {
050:            /**
051:             * Offset value for when we want to stream all.
052:             */
053:            private final static int STREAM_ALL = -1;
054:
055:            private static final ArchiveReaderFactory factory = new ArchiveReaderFactory();
056:
057:            /**
058:             * Shutdown any public access to default constructor.
059:             */
060:            protected ArchiveReaderFactory() {
061:                super ();
062:            }
063:
064:            /**
065:             * Get an Archive file Reader on passed path or url.
066:             * Does primitive heuristic figuring if path or URL.
067:             * @param arcFileOrUrl File path or URL pointing at an Archive file.
068:             * @return An Archive file Reader.
069:             * @throws IOException 
070:             * @throws MalformedURLException 
071:             * @throws IOException 
072:             */
073:            public static ArchiveReader get(final String arcFileOrUrl)
074:                    throws MalformedURLException, IOException {
075:                return ArchiveReaderFactory.factory
076:                        .getArchiveReader(arcFileOrUrl);
077:            }
078:
079:            protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
080:                    throws MalformedURLException, IOException {
081:                return getArchiveReader(arcFileOrUrl, STREAM_ALL);
082:            }
083:
084:            protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
085:                    final long offset) throws MalformedURLException,
086:                    IOException {
087:                return UURI.hasScheme(arcFileOrUrl) ? get(
088:                        new URL(arcFileOrUrl), offset) : get(new File(
089:                        arcFileOrUrl), offset);
090:            }
091:
092:            /**
093:             * @param f An Archive file to read.
094:             * @return An ArchiveReader
095:             * @throws IOException 
096:             */
097:            public static ArchiveReader get(final File f) throws IOException {
098:                return ArchiveReaderFactory.factory.getArchiveReader(f);
099:            }
100:
101:            protected ArchiveReader getArchiveReader(final File f)
102:                    throws IOException {
103:                return getArchiveReader(f, 0);
104:            }
105:
106:            /**
107:             * @param f An Archive file to read.
108:             * @param offset Have returned Reader set to start reading at this offset.
109:             * @return An ArchiveReader
110:             * @throws IOException 
111:             */
112:            public static ArchiveReader get(final File f, final long offset)
113:                    throws IOException {
114:                return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
115:            }
116:
117:            protected ArchiveReader getArchiveReader(final File f,
118:                    final long offset) throws IOException {
119:                if (ARCReaderFactory.isARCSuffix(f.getName())) {
120:                    return ARCReaderFactory.get(f, true, offset);
121:                } else if (WARCReaderFactory.isWARCSuffix(f.getName())) {
122:                    return WARCReaderFactory.get(f, offset);
123:                }
124:                throw new IOException(
125:                        "Unknown file extension (Not ARC nor WARC): "
126:                                + f.getName());
127:            }
128:
129:            /**
130:             * Wrap a Reader around passed Stream.
131:             * @param s Identifying String for this Stream used in error messages.
132:             * Must be a string that ends with the name of the file we're to put
133:             * an ArchiveReader on.  This code looks at file endings to figure
134:             * whether to return an ARC or WARC reader.
135:             * @param is Stream.  Stream will be wrapped with implementation of
136:             * RepositionableStream unless already supported.
137:             * @param atFirstRecord Are we at first Record?
138:             * @return ArchiveReader.
139:             * @throws IOException
140:             */
141:            public static ArchiveReader get(final String s,
142:                    final InputStream is, final boolean atFirstRecord)
143:                    throws IOException {
144:                return ArchiveReaderFactory.factory.getArchiveReader(s, is,
145:                        atFirstRecord);
146:            }
147:
148:            /**
149:             * @param is
150:             * @return If passed <code>is</code> is
151:             * {@link RepositionableInputStream}, returns <code>is</code>, else we
152:             * wrap <code>is</code> with {@link RepositionableStream}.
153:             */
154:            protected InputStream asRepositionable(final InputStream is) {
155:                if (is instanceof  RepositionableStream) {
156:                    return is;
157:                }
158:                // RepositionableInputStream calls mark on each read so can back up at
159:                // least the read amount.  Needed for gzip inflater overinflations
160:                // reading into the next gzip member.
161:                return new RepositionableInputStream(is, 16 * 1024);
162:            }
163:
164:            protected ArchiveReader getArchiveReader(final String id,
165:                    final InputStream is, final boolean atFirstRecord)
166:                    throws IOException {
167:                final InputStream stream = asRepositionable(is);
168:                if (ARCReaderFactory.isARCSuffix(id)) {
169:                    return ARCReaderFactory.get(id, stream, atFirstRecord);
170:                } else if (WARCReaderFactory.isWARCSuffix(id)) {
171:                    return WARCReaderFactory.get(id, stream, atFirstRecord);
172:                }
173:                throw new IOException("Unknown extension (Not ARC nor WARC): "
174:                        + id);
175:            }
176:
177:            /**
178:             * Get an Archive Reader aligned at <code>offset</code>.
179:             * This version of get will not bring the file local but will try to
180:             * stream across the net making an HTTP 1.1 Range request on remote
181:             * http server (RFC1435 Section 14.35).
182:             * @param u HTTP URL for an Archive file.
183:             * @param offset Offset into file at which to start fetching.
184:             * @return An ArchiveReader aligned at offset.
185:             * @throws IOException
186:             */
187:            public static ArchiveReader get(final URL u, final long offset)
188:                    throws IOException {
189:                return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
190:            }
191:
192:            protected ArchiveReader getArchiveReader(final URL f,
193:                    final long offset) throws IOException {
194:                // Get URL connection.
195:                URLConnection connection = f.openConnection();
196:                if (!(connection instanceof  HttpURLConnection)) {
197:                    throw new IOException(
198:                            "This method only handles HTTP connections.");
199:                }
200:                addUserAgent((HttpURLConnection) connection);
201:                if (offset != STREAM_ALL) {
202:                    // Use a Range request (Assumes HTTP 1.1 on other end). If
203:                    // length >= 0, add open-ended range header to the request.  Else,
204:                    // because end-byte is inclusive, subtract 1.
205:                    connection.addRequestProperty("Range", "bytes=" + offset
206:                            + "-");
207:                }
208:
209:                return getArchiveReader(f.toString(), connection
210:                        .getInputStream(), (offset == 0));
211:            }
212:
213:            /**
214:             * Get an ARCReader.
215:             * Pulls the ARC local into whereever the System Property
216:             * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
217:             * points at this local copy.  A close on this ARCReader instance will
218:             * remove the local copy.
219:             * @param u An URL that points at an ARC.
220:             * @return An ARCReader.
221:             * @throws IOException 
222:             */
223:            public static ArchiveReader get(final URL u) throws IOException {
224:                return ArchiveReaderFactory.factory.getArchiveReader(u);
225:            }
226:
227:            protected ArchiveReader getArchiveReader(final URL u)
228:                    throws IOException {
229:                // If url represents a local file then return file it points to.
230:                if (u.getPath() != null) {
231:                    // TODO: Add scheme check and host check.
232:                    File f = new File(u.getPath());
233:                    if (f.exists()) {
234:                        return get(f, 0);
235:                    }
236:                }
237:
238:                String scheme = u.getProtocol();
239:                if (scheme.startsWith("http") || scheme.equals("s3")) {
240:                    // Try streaming if http or s3 URLs rather than copying local
241:                    // and then reading (Passing an offset will get us an Reader
242:                    // that wraps a Stream).
243:                    return get(u, STREAM_ALL);
244:                }
245:
246:                return makeARCLocal(u.openConnection());
247:            }
248:
249:            protected ArchiveReader makeARCLocal(final URLConnection connection)
250:                    throws IOException {
251:                File localFile = null;
252:                if (connection instanceof  HttpURLConnection) {
253:                    // If http url connection, bring down the resource local.
254:                    String p = connection.getURL().getPath();
255:                    int index = p.lastIndexOf('/');
256:                    if (index >= 0) {
257:                        // Name file for the file we're making local.
258:                        localFile = new File(FileUtils.TMPDIR, p
259:                                .substring(index + 1));
260:                        if (localFile.exists()) {
261:                            // If file of same name already exists in TMPDIR, then
262:                            // clean it up (Assuming only reason a file of same name in
263:                            // TMPDIR is because we failed a previous download).
264:                            localFile.delete();
265:                        }
266:                    } else {
267:                        localFile = File.createTempFile(ArchiveReader.class
268:                                .getName(), ".tmp", FileUtils.TMPDIR);
269:                    }
270:                    addUserAgent((HttpURLConnection) connection);
271:                    connection.connect();
272:                    try {
273:                        IoUtils.readFullyToFile(connection.getInputStream(),
274:                                localFile, new byte[16 * 1024]);
275:                    } catch (IOException ioe) {
276:                        localFile.delete();
277:                        throw ioe;
278:                    }
279:                } else if (connection instanceof  RsyncURLConnection) {
280:                    // Then, connect and this will create a local file.
281:                    // See implementation of the rsync handler.
282:                    connection.connect();
283:                    localFile = ((RsyncURLConnection) connection).getFile();
284:                } else if (connection instanceof  Md5URLConnection) {
285:                    // Then, connect and this will create a local file.
286:                    // See implementation of the md5 handler.
287:                    connection.connect();
288:                    localFile = ((Md5URLConnection) connection).getFile();
289:                } else {
290:                    throw new UnsupportedOperationException("No support for "
291:                            + connection);
292:                }
293:
294:                ArchiveReader reader = null;
295:                try {
296:                    reader = get(localFile, 0);
297:                } catch (IOException e) {
298:                    localFile.delete();
299:                    throw e;
300:                }
301:
302:                // Return a delegate that does cleanup of downloaded file on close.
303:                return reader.getDeleteFileOnCloseReader(localFile);
304:            }
305:
306:            protected void addUserAgent(final HttpURLConnection connection) {
307:                connection.addRequestProperty("User-Agent", this .getClass()
308:                        .getName());
309:            }
310:
311:            /**
312:             * @param f File to test.
313:             * @return True if <code>f</code> is compressed.
314:             * @throws IOException
315:             */
316:            protected boolean isCompressed(final File f) throws IOException {
317:                return f.getName().toLowerCase().endsWith(
318:                        DOT_COMPRESSED_FILE_EXTENSION);
319:            }
320:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.