001: /* DefaultBlockFileSystem
002: *
003: * Created on September 12, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.util.ms;
024:
025: import java.io.IOException;
026: import java.nio.ByteBuffer;
027: import java.nio.ByteOrder;
028: import java.util.Map;
029:
030: import org.archive.io.SeekInputStream;
031: import org.archive.util.IoUtils;
032: import org.archive.util.LRU;
033:
034: /**
035: * Default implementation of the Block File System.
036: *
037: * <p>The overall structure of a BlockFileSystem file (such as a .doc file) is
038: * as follows. The file is divided into blocks, which are of uniform length
039: * (512 bytes). The first block (at file pointer 0) is called the header
040: * block. It's used to look up other blocks in the file.
041: *
042: * <p>Subfiles contained within the .doc file are organized using a Block
043: * Allocation Table, or BAT. The BAT is basically a linked list; given a
044: * block number, the BAT will tell you the next block number. Note that
045: * the header block has no number; block #0 is the first block after the
046: * header. Thus, to convert a block number to a file pointer:
047: * <code>int filePointer = (blockNumber + 1) * BLOCK_SIZE</code>.
048: *
049: * <p>The BAT itself is discontinuous, however. To find the blocks that
050: * comprise the BAT, you have to look in the header block. The header block
051: * contains an array of 109 pointers to the blocks that comprise the BAT.
052: * If more than 109 BAT blocks are required (in other words, if the .doc
053: * file is larger than ~6 megabytes), then something called the
054: * XBAT comes into play.
055: *
056: * <p>XBAT blocks contain pointers to the 110th BAT block and beyond.
057: * The first XBAT block is stored at a file pointer listed in the header.
058: * The other XBAT blocks are always stored in order after the first; the
059: * XBAT table is continuous. One is inclined to wonder why the BAT itself
060: * is not so stored, but oh well.
061: *
062: * <p>The BAT only tells you the next block for a given block. To find the
063: * first block for a subfile, you have to look up that subfile's directory
064: * entry. Each directory entry is a 128 byte structure in the file, so four
065: * of them fit in a block. The number of the first block of the entry list
066: * is stored in the header. To find subsequent entry blocks, the BAT must
067: * be used.
068: *
069: * <p>I'm telling you all this so that you understand the caching that this
070: * class provides.
071: *
072: * <p>First, directory entries are not cached. It's assumed that they will
073: * be looked up at the beginning of a lengthy operation, and then forgotten
074: * about. This is certainly the case for {@link Doc#getText(BlockFileSystem)}.
075: * If you need to remember directory entries, you can manually store the Entry
076: * objects in a map or something, as they don't grow stale.
077: *
078: * <p>This class keeps all 512 bytes of the header block in memory at all
079: * times. This prevents a potentially expensive file pointer repositioning
080: * every time you're trying to figure out what comes next.
081: *
082: * <p>BAT and XBAT blocks are stored in a least-recently used cache. The
083: * <i>n</i> most recent BAT and XBAT blocks are remembered, where <i>n</i>
084: * is set at construction time. The minimum value of <i>n</i> is 1. For small
085: * files, this can prevent file pointer repositioning for BAT look ups.
086: *
087: * <p>The BAT/XBAT cache only takes up memory as needed. If the specified
088: * cache size is 100 blocks, but the file only has 4 BAT blocks, then only
089: * 2048 bytes will be used by the cache.
090: *
091: * <p>Note this class only caches BAT and XBAT blocks. It does not cache the
092: * blocks that actually make up a subfile's contents. It is assumed that those
093: * blocks will only be accessed once per operation (again, this is what
094: * {Doc.getText(BlockFileSystem)} typically requires.)
095: *
096: * @author pjack
097: * @see http://jakarta.apache.org/poi/poifs/fileformat.html
098: */
099: public class DefaultBlockFileSystem implements BlockFileSystem {
100:
101: /**
102: * Pointers per BAT block.
103: */
104: final private static int POINTERS_PER_BAT = 128;
105:
106: /**
107: * Size of a BAT pointer in bytes. (In other words, 4).
108: */
109: final private static int BAT_POINTER_SIZE = BLOCK_SIZE
110: / POINTERS_PER_BAT;
111:
112: /**
113: * The number of BAT pointers in the header block. After this many
114: * BAT blocks, the XBAT blocks must be consulted.
115: */
116: final private static int HEADER_BAT_LIMIT = 109;
117:
118: /**
119: * The size of an entry record in bytes.
120: */
121: final private static int ENTRY_SIZE = 128;
122:
123: /**
124: * The number of entries that can fit in a block.
125: */
126: final private static int ENTRIES_PER_BLOCK = BLOCK_SIZE
127: / ENTRY_SIZE;
128:
129: /**
130: * The .doc file as a stream.
131: */
132: private SeekInputStream input;
133:
134: /**
135: * The header block.
136: */
137: private HeaderBlock header;
138:
139: /**
140: * Cache of BAT and XBAT blocks.
141: */
142: private Map<Integer, ByteBuffer> cache;
143:
144: /**
145: * Constructor.
146: *
147: * @param input the file to read from
148: * @param batCacheSize number of BAT and XBAT blocks to cache
149: * @throws IOException if an IO error occurs
150: */
151: public DefaultBlockFileSystem(SeekInputStream input,
152: int batCacheSize) throws IOException {
153: this .input = input;
154: byte[] temp = new byte[BLOCK_SIZE];
155: IoUtils.readFully(input, temp);
156: this .header = new HeaderBlock(ByteBuffer.wrap(temp));
157: this .cache = new LRU<Integer, ByteBuffer>(batCacheSize);
158: }
159:
160: public Entry getRoot() throws IOException {
161: // Position to the first block of the entry list.
162: int block = header.getEntriesStart();
163: input.position((block + 1) * BLOCK_SIZE);
164:
165: // The root entry is always entry #0.
166: return new DefaultEntry(this , input, 0);
167: }
168:
169: /**
170: * Returns the entry with the given number.
171: *
172: * @param entryNumber the number of the entry to return
173: * @return that entry, or null if no such entry exists
174: * @throws IOException if an IO error occurs
175: */
176: Entry getEntry(int entryNumber) throws IOException {
177: // Entry numbers < 0 typically indicate an end-of-stream.
178: if (entryNumber < 0) {
179: return null;
180: }
181:
182: // It's impossible to check against the upper bound, because the
183: // upper bound is not recorded anywhere.
184:
185: // Advance to the block containing the desired entry.
186: int blockCount = entryNumber / ENTRIES_PER_BLOCK;
187: int remainder = entryNumber % ENTRIES_PER_BLOCK;
188: int block = header.getEntriesStart();
189: for (int i = 0; i < blockCount; i++) {
190: block = getNextBlock(block);
191: }
192:
193: if (block < 0) {
194: // Given entry number exceeded the number of available entries.
195: return null;
196: }
197:
198: int filePos = (block + 1) * BLOCK_SIZE + remainder * ENTRY_SIZE;
199: input.position(filePos);
200:
201: return new DefaultEntry(this , input, entryNumber);
202: }
203:
204: public int getNextBlock(int block) throws IOException {
205: if (block < 0) {
206: return block;
207: }
208:
209: // Index into the header array of BAT blocks.
210: int headerIndex = block / POINTERS_PER_BAT;
211:
212: // Index within that BAT block of the block we're interested in.
213: int batBlockIndex = block % POINTERS_PER_BAT;
214:
215: int batBlockNumber = batLookup(headerIndex);
216: ByteBuffer batBlock = getBATBlock(batBlockNumber);
217: return batBlock.getInt(batBlockIndex * BAT_POINTER_SIZE);
218: }
219:
220: /**
221: * Looks up the block number of a BAT block.
222: *
223: * @param headerIndex
224: * @return
225: * @throws IOException
226: */
227: private int batLookup(int headerIndex) throws IOException {
228: if (headerIndex < HEADER_BAT_LIMIT + 1) {
229: return header.getBATBlockNumber(headerIndex);
230: }
231:
232: // Find the XBAT block of interest
233: headerIndex -= HEADER_BAT_LIMIT;
234: int xbatBlockNumber = headerIndex / POINTERS_PER_BAT;
235: xbatBlockNumber += header.getExtendedBATStart();
236: ByteBuffer xbat = getBATBlock(xbatBlockNumber);
237:
238: // Find the bat Block number inside the XBAT block
239: int xbatBlockIndex = headerIndex % POINTERS_PER_BAT;
240: return xbat.getInt(xbatBlockIndex * BAT_POINTER_SIZE);
241: }
242:
243: /**
244: * Returns the BAT block with the given block number.
245: * If the BAT block were previously cached, then the cached version
246: * is returned. Otherwise, the file pointer is repoisitioned to
247: * the start of the given block, and the 512 bytes are read and
248: * stored in the cache.
249: *
250: * @param block the block number of the BAT block to return
251: * @return the BAT block
252: * @throws IOException
253: */
254: private ByteBuffer getBATBlock(int block) throws IOException {
255: ByteBuffer r = cache.get(block);
256: if (r != null) {
257: return r;
258: }
259:
260: byte[] buf = new byte[BLOCK_SIZE];
261: input.position((block + 1) * BLOCK_SIZE);
262: IoUtils.readFully(input, buf);
263:
264: r = ByteBuffer.wrap(buf);
265: r.order(ByteOrder.LITTLE_ENDIAN);
266: cache.put(block, r);
267: return r;
268: }
269:
270: public SeekInputStream getRawInput() {
271: return input;
272: }
273: }
|