001: /* ARCUtils
002: *
003: * Created on Aug 10, 2005
004: *
005: * Copyright (C) 2005 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io.arc;
024:
025: import it.unimi.dsi.fastutil.io.RepositionableStream;
026:
027: import java.io.File;
028: import java.io.FileInputStream;
029: import java.io.FileNotFoundException;
030: import java.io.IOException;
031: import java.io.InputStream;
032: import java.net.URI;
033: import java.net.URISyntaxException;
034:
035: import org.archive.io.GzipHeader;
036: import org.archive.io.NoGzipMagicException;
037: import org.archive.net.UURI;
038:
039: public class ARCUtils implements ARCConstants {
040: /**
041: * @param pathOrUri Path or URI to extract arc filename from.
042: * @return Extracted arc file name.
043: * @throws URISyntaxException
044: */
045: public static String parseArcFilename(final String pathOrUri)
046: throws URISyntaxException {
047: String path = pathOrUri;
048: if (UURI.hasScheme(pathOrUri)) {
049: URI url = new URI(pathOrUri);
050: path = url.getPath();
051: }
052: return (new File(path)).getName();
053: }
054:
055: /**
056: * @param arcFile File to test.
057: * @return True if <code>arcFile</code> is compressed ARC.
058: * @throws IOException
059: */
060: public static boolean isCompressed(File arcFile) throws IOException {
061: return testCompressedARCFile(arcFile);
062: }
063:
064: /**
065: * Check file is compressed and in ARC GZIP format.
066: *
067: * @param arcFile File to test if its Internet Archive ARC file
068: * GZIP compressed.
069: *
070: * @return True if this is an Internet Archive GZIP'd ARC file (It begins
071: * w/ the Internet Archive GZIP header and has the
072: * COMPRESSED_ARC_FILE_EXTENSION suffix).
073: *
074: * @exception IOException If file does not exist or is not unreadable.
075: */
076: public static boolean testCompressedARCFile(File arcFile)
077: throws IOException {
078: return testCompressedARCFile(arcFile, false);
079: }
080:
081: /**
082: * Check file is compressed and in ARC GZIP format.
083: *
084: * @param arcFile File to test if its Internet Archive ARC file
085: * GZIP compressed.
086: * @param skipSuffixCheck Set to true if we're not to test on the
087: * '.arc.gz' suffix.
088: *
089: * @return True if this is an Internet Archive GZIP'd ARC file (It begins
090: * w/ the Internet Archive GZIP header).
091: *
092: * @exception IOException If file does not exist or is not unreadable.
093: */
094: public static boolean testCompressedARCFile(File arcFile,
095: boolean skipSuffixCheck) throws IOException {
096: boolean compressedARCFile = false;
097: isReadable(arcFile);
098: if (!skipSuffixCheck
099: && !arcFile.getName().toLowerCase().endsWith(
100: COMPRESSED_ARC_FILE_EXTENSION)) {
101: return compressedARCFile;
102: }
103:
104: final InputStream is = new FileInputStream(arcFile);
105: try {
106: compressedARCFile = testCompressedARCStream(is);
107: } finally {
108: is.close();
109: }
110: return compressedARCFile;
111: }
112:
113: /**
114: * Tests passed stream is gzip stream by reading in the HEAD.
115: * Does not reposition the stream. That is left up to the caller.
116: * @param is An InputStream.
117: * @return True if compressed stream.
118: * @throws IOException
119: */
120: public static boolean testCompressedARCStream(final InputStream is)
121: throws IOException {
122: boolean compressedARCFile = false;
123: GzipHeader gh = null;
124: try {
125: gh = new GzipHeader(is);
126: } catch (NoGzipMagicException e) {
127: return compressedARCFile;
128: }
129:
130: byte[] fextra = gh.getFextra();
131: // Now make sure following bytes are IA GZIP comment.
132: // First check length. ARC_GZIP_EXTRA_FIELD includes length
133: // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
134: // at +2.
135: if (fextra != null
136: && ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
137: compressedARCFile = true;
138: for (int i = 0; i < fextra.length; i++) {
139: if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
140: compressedARCFile = false;
141: break;
142: }
143: }
144: }
145: return compressedARCFile;
146: }
147:
148: /**
149: * Tests passed stream is gzip stream by reading in the HEAD.
150: * Does reposition of stream when done.
151: * @param rs An InputStream that is Repositionable.
152: * @return True if compressed stream.
153: * @throws IOException
154: */
155: public static boolean testCompressedRepositionalStream(
156: final RepositionableStream rs) throws IOException {
157: boolean compressedARCFile = false;
158: long p = rs.position();
159: try {
160: compressedARCFile = testCompressedStream((InputStream) rs);
161: } finally {
162: rs.position(p);
163: }
164: return compressedARCFile;
165: }
166:
167: /**
168: * Tests passed stream is gzip stream by reading in the HEAD.
169: * Does reposition of stream when done.
170: * @param is An InputStream.
171: * @return True if compressed stream.
172: * @throws IOException
173: */
174: public static boolean testCompressedStream(final InputStream is)
175: throws IOException {
176: boolean compressedARCFile = false;
177: try {
178: new GzipHeader(is);
179: compressedARCFile = true;
180: } catch (NoGzipMagicException e) {
181: return compressedARCFile;
182: }
183: return compressedARCFile;
184: }
185:
186: /**
187: * Check file is uncompressed ARC file.
188: *
189: * @param arcFile
190: * File to test if its Internet Archive ARC file uncompressed.
191: *
192: * @return True if this is an Internet Archive ARC file.
193: *
194: * @exception IOException
195: * If file does not exist or is not unreadable.
196: */
197: public static boolean testUncompressedARCFile(File arcFile)
198: throws IOException {
199: boolean uncompressedARCFile = false;
200: isReadable(arcFile);
201: if (arcFile.getName().toLowerCase()
202: .endsWith(ARC_FILE_EXTENSION)) {
203: FileInputStream fis = new FileInputStream(arcFile);
204: try {
205: byte[] b = new byte[ARC_MAGIC_NUMBER.length()];
206: int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length());
207: fis.close();
208: if (read == ARC_MAGIC_NUMBER.length()) {
209: StringBuffer beginStr = new StringBuffer(
210: ARC_MAGIC_NUMBER.length());
211: for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) {
212: beginStr.append((char) b[i]);
213: }
214:
215: if (beginStr.toString().equalsIgnoreCase(
216: ARC_MAGIC_NUMBER)) {
217: uncompressedARCFile = true;
218: }
219: }
220: } finally {
221: fis.close();
222: }
223: }
224:
225: return uncompressedARCFile;
226: }
227:
228: /**
229: * @param arcFile File to test.
230: * @exception IOException If file does not exist or is not unreadable.
231: */
232: private static void isReadable(File arcFile) throws IOException {
233: if (!arcFile.exists()) {
234: throw new FileNotFoundException(arcFile.getAbsolutePath()
235: + " does not exist.");
236: }
237:
238: if (!arcFile.canRead()) {
239: throw new FileNotFoundException(arcFile.getAbsolutePath()
240: + " is not readable.");
241: }
242: }
243: }
|