001: /* GzipHeader
002: *
003: * $Id: GzipHeader.java 4064 2005-12-20 18:11:33Z stack-sf $
004: *
005: * Created on July 5, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.io;
026:
027: import java.io.EOFException;
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.util.zip.CRC32;
031: import java.util.zip.Deflater;
032: import java.util.zip.GZIPInputStream;
033:
034: /**
035: * Read in the GZIP header.
036: *
037: * See RFC1952 for specification on what the header looks like.
038: * Assumption is that stream is cued-up with the gzip header as the
039: * next thing to be read.
040: *
041: * <p>Of <a href="http://jguru.com/faq/view.jsp?EID=13647">Java
042: * and unsigned bytes</a>. That is, its always a signed int in
043: * java no matter what the qualifier whether byte, char, etc.
044: *
045: * <p>Add accessors for optional filename, comment and MTIME.
046: *
047: * @author stack
048: */
049: public class GzipHeader {
050: /**
051: * Length of minimal GZIP header.
052: *
053: * See RFC1952 for explaination of value of 10.
054: */
055: public static final int MINIMAL_GZIP_HEADER_LENGTH = 10;
056:
057: /**
058: * Total length of the gzip header.
059: */
060: protected int length = 0;
061:
062: /**
063: * The GZIP header FLG byte.
064: */
065: protected int flg;
066:
067: /**
068: * GZIP header XFL byte.
069: */
070: private int xfl;
071:
072: /**
073: * GZIP header OS byte.
074: */
075: private int os;
076:
077: /**
078: * Extra header field content.
079: */
080: private byte[] fextra = null;
081:
082: /**
083: * GZIP header MTIME field.
084: */
085: private int mtime;
086:
087: /**
088: * Shutdown constructor.
089: *
090: * Must pass an input stream.
091: */
092: public GzipHeader() {
093: super ();
094: }
095:
096: /**
097: * Constructor.
098: *
099: * This constructor advances the stream past any gzip header found.
100: *
101: * @param in InputStream to read from.
102: * @throws IOException
103: */
104: public GzipHeader(InputStream in) throws IOException {
105: super ();
106: readHeader(in);
107: }
108:
109: /**
110: * Read in gzip header.
111: *
112: * Advances the stream past the gzip header.
113: * @param in InputStream.
114: *
115: * @throws IOException Throws if does not start with GZIP Header.
116: */
117: public void readHeader(InputStream in) throws IOException {
118: CRC32 crc = new CRC32();
119: crc.reset();
120: if (!testGzipMagic(in, crc)) {
121: throw new NoGzipMagicException();
122: }
123: this .length += 2;
124: if (readByte(in, crc) != Deflater.DEFLATED) {
125: throw new IOException("Unknown compression");
126: }
127: this .length++;
128:
129: // Get gzip header flag.
130: this .flg = readByte(in, crc);
131: this .length++;
132:
133: // Get MTIME.
134: this .mtime = readInt(in, crc);
135: this .length += 4;
136:
137: // Read XFL and OS.
138: this .xfl = readByte(in, crc);
139: this .length++;
140: this .os = readByte(in, crc);
141: this .length++;
142:
143: // Skip optional extra field -- stuff w/ alexa stuff in it.
144: final int FLG_FEXTRA = 4;
145: if ((this .flg & FLG_FEXTRA) == FLG_FEXTRA) {
146: int count = readShort(in, crc);
147: this .length += 2;
148: this .fextra = new byte[count];
149: readByte(in, crc, this .fextra, 0, count);
150: this .length += count;
151: }
152:
153: // Skip file name. It ends in null.
154: final int FLG_FNAME = 8;
155: if ((this .flg & FLG_FNAME) == FLG_FNAME) {
156: while (readByte(in, crc) != 0) {
157: this .length++;
158: }
159: }
160:
161: // Skip file comment. It ends in null.
162: final int FLG_FCOMMENT = 16; // File comment
163: if ((this .flg & FLG_FCOMMENT) == FLG_FCOMMENT) {
164: while (readByte(in, crc) != 0) {
165: this .length++;
166: }
167: }
168:
169: // Check optional CRC.
170: final int FLG_FHCRC = 2;
171: if ((this .flg & FLG_FHCRC) == FLG_FHCRC) {
172: int calcCrc = (int) (crc.getValue() & 0xffff);
173: if (readShort(in, crc) != calcCrc) {
174: throw new IOException("Bad header CRC");
175: }
176: this .length += 2;
177: }
178: }
179:
180: /**
181: * Test gzip magic is next in the stream.
182: * Reads two bytes. Caller needs to manage resetting stream.
183: * @param in InputStream to read.
184: * @return true if found gzip magic. False otherwise
185: * or an IOException (including EOFException).
186: * @throws IOException
187: */
188: public boolean testGzipMagic(InputStream in) throws IOException {
189: return testGzipMagic(in, null);
190: }
191:
192: /**
193: * Test gzip magic is next in the stream.
194: * Reads two bytes. Caller needs to manage resetting stream.
195: * @param in InputStream to read.
196: * @param crc CRC to update.
197: * @return true if found gzip magic. False otherwise
198: * or an IOException (including EOFException).
199: * @throws IOException
200: */
201: public boolean testGzipMagic(InputStream in, CRC32 crc)
202: throws IOException {
203: return readShort(in, crc) == GZIPInputStream.GZIP_MAGIC;
204: }
205:
206: /**
207: * Read an int.
208: *
209: * We do not expect to get a -1 reading. If we do, we throw exception.
210: * Update the crc as we go.
211: *
212: * @param in InputStream to read.
213: * @param crc CRC to update.
214: * @return int read.
215: *
216: * @throws IOException
217: */
218: private int readInt(InputStream in, CRC32 crc) throws IOException {
219: int s = readShort(in, crc);
220: return ((readShort(in, crc) << 16) & 0xffff0000) | s;
221: }
222:
223: /**
224: * Read a short.
225: *
226: * We do not expect to get a -1 reading. If we do, we throw exception.
227: * Update the crc as we go.
228: *
229: * @param in InputStream to read.
230: * @param crc CRC to update.
231: * @return Short read.
232: *
233: * @throws IOException
234: */
235: private int readShort(InputStream in, CRC32 crc) throws IOException {
236: int b = readByte(in, crc);
237: return ((readByte(in, crc) << 8) & 0x00ff00) | b;
238: }
239:
240: /**
241: * Read a byte.
242: *
243: * We do not expect to get a -1 reading. If we do, we throw exception.
244: * Update the crc as we go.
245: *
246: * @param in InputStream to read.
247: * @return Byte read.
248: *
249: * @throws IOException
250: */
251: protected int readByte(InputStream in) throws IOException {
252: return readByte(in, null);
253: }
254:
255: /**
256: * Read a byte.
257: *
258: * We do not expect to get a -1 reading. If we do, we throw exception.
259: * Update the crc as we go.
260: *
261: * @param in InputStream to read.
262: * @param crc CRC to update.
263: * @return Byte read.
264: *
265: * @throws IOException
266: */
267: protected int readByte(InputStream in, CRC32 crc)
268: throws IOException {
269: int b = in.read();
270: if (b == -1) {
271: throw new EOFException();
272: }
273: if (crc != null) {
274: crc.update(b);
275: }
276: return b & 0xff;
277: }
278:
279: /**
280: * Read a byte.
281: *
282: * We do not expect to get a -1 reading. If we do, we throw exception.
283: * Update the crc as we go.
284: *
285: * @param in InputStream to read.
286: * @param crc CRC to update.
287: * @param buffer Buffer to read into.
288: * @param offset Offset to start filling buffer at.
289: * @param length How much to read.
290: * @return Bytes read.
291: *
292: * @throws IOException
293: */
294: protected int readByte(InputStream in, CRC32 crc, byte[] buffer,
295: int offset, int length) throws IOException {
296: for (int i = offset; i < length; i++) {
297: buffer[offset + i] = (byte) readByte(in, crc);
298: }
299: return length;
300: }
301:
302: /**
303: * @return Returns the fextra.
304: */
305: public byte[] getFextra() {
306: return this .fextra;
307: }
308:
309: /**
310: * @return Returns the flg.
311: */
312: public int getFlg() {
313: return this .flg;
314: }
315:
316: /**
317: * @return Returns the os.
318: */
319: public int getOs() {
320: return this .os;
321: }
322:
323: /**
324: * @return Returns the xfl.
325: */
326: public int getXfl() {
327: return this .xfl;
328: }
329:
330: /**
331: * @return Returns the mtime.
332: */
333: public int getMtime() {
334: return this .mtime;
335: }
336:
337: /**
338: * @return Returns the length.
339: */
340: public int getLength() {
341: return length;
342: }
343: }
|