/*
 * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007.
 *
 * Licensed under the Aduna BSD-style license.
 */
package org.openrdf.sail.nativerdf.datastore;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;

/**
 * Class supplying access to a hash file.
 *
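 * <p>
 * A minimal usage sketch (hypothetical file name, <tt>key</tt> and <tt>id</tt>
 * values; error handling omitted):
 *
 * <pre>
 * HashFile hashFile = new HashFile(new File(&quot;values.hash&quot;));
 * hashFile.storeID(key.hashCode(), id);
 *
 * HashFile.IDIterator iter = hashFile.getIDIterator(key.hashCode());
 * for (int candidate = iter.next(); candidate &gt;= 0; candidate = iter.next()) {
 *     // candidate shares the hash code with 'key'; verify it against the
 *     // actual data before use
 * }
 *
 * hashFile.sync();
 * hashFile.close();
 * </pre>
 *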
 * @author Arjohn Kampman
 */
public class HashFile {

    /*-----------*
     * Constants *
     *-----------*/

    // The size of an item (32-bit hash + 32-bit ID), in bytes
    private static final int ITEM_SIZE = 8;

    /**
     * Magic number "Native Hash File" to detect whether the file is actually a
     * hash file. The first three bytes of the file should be equal to this magic
     * number.
     */
    private static final byte[] MAGIC_NUMBER = new byte[] { 'n', 'h', 'f' };

    /**
     * File format version, stored as the fourth byte in hash files.
     */
    private static final byte FILE_FORMAT_VERSION = 1;

    /**
     * The size of the file header in bytes. The file header contains the
     * following data: magic number (3 bytes), file format version (1 byte),
     * number of buckets (4 bytes), bucket size (4 bytes) and number of stored
     * items (4 bytes).
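     * Byte layout: offsets 0-2 hold the magic number, offset 3 the file format
     * version, offsets 4-7 the bucket count, offsets 8-11 the bucket size and
     * offsets 12-15 the item count.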
     */
    private static final long HEADER_LENGTH = 16;

    private static final int INIT_BUCKET_COUNT = 64;

    private static final int INIT_BUCKET_SIZE = 8;

    /*-----------*
     * Variables *
     *-----------*/

    private File file;

    private RandomAccessFile raf;

    private FileChannel fileChannel;

    private boolean forceSync;

    // The number of (non-overflow) buckets in the hash file
    private int bucketCount;

    // The number of items that can be stored in a bucket
    private int bucketSize;

    // The number of items in the hash file
    private int itemCount;

    // Load factor (fixed, for now): the table is doubled in size when
    // itemCount reaches loadFactor * bucketCount * bucketSize
    private float loadFactor = 0.75f;

    // The size of a bucket in bytes: bucketSize items plus a 4-byte overflow
    // bucket ID (recordSize = ITEM_SIZE * bucketSize + 4)
    private int recordSize;

    /*--------------*
     * Constructors *
     *--------------*/

    public HashFile(File file) throws IOException {
        this(file, false);
    }

    public HashFile(File file, boolean forceSync) throws IOException {
        this.file = file;
        this.forceSync = forceSync;

        if (!file.exists()) {
            boolean created = file.createNewFile();
            if (!created) {
                throw new IOException("Failed to create file: " + file);
            }
        }

        // Open a read/write channel to the file
        raf = new RandomAccessFile(file, "rw");
        fileChannel = raf.getChannel();

        if (fileChannel.size() == 0L) {
            // Empty file, insert bucket count, bucket size
            // and item count at the start of the file
            bucketCount = INIT_BUCKET_COUNT;
            bucketSize = INIT_BUCKET_SIZE;
            itemCount = 0;
            recordSize = ITEM_SIZE * bucketSize + 4;

            // Initialize the file by writing bucketCount empty buckets
            writeEmptyBuckets(HEADER_LENGTH, bucketCount);

            sync();
        } else {
            // Read bucket count, bucket size and item count from the file
            readFileHeader();

            recordSize = ITEM_SIZE * bucketSize + 4;
        }
    }

    /*---------*
     * Methods *
     *---------*/

    public File getFile() {
        return file;
    }

    public FileChannel getFileChannel() {
        return fileChannel;
    }

    public int getBucketCount() {
        return bucketCount;
    }

    public int getBucketSize() {
        return bucketSize;
    }

    public int getItemCount() {
        return itemCount;
    }

    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Gets an iterator that iterates over the IDs with hash codes that match
     * the specified hash code.
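     * Note that different values can share a hash code, so callers should
     * verify each returned ID against the actual stored data.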
     */
    public IDIterator getIDIterator(int hash) throws IOException {
        return new IDIterator(hash);
    }

    /**
     * Stores ID under the specified hash code in this hash file.
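     * The ID is written to the first free slot for that hash code; no
     * duplicate check is performed.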
     */
    public void storeID(int hash, int id) throws IOException {
        // Calculate bucket offset for initial bucket
        long bucketOffset = getBucketOffset(hash);

        storeID(bucketOffset, hash, id);

        itemCount++;

        if (itemCount >= loadFactor * bucketCount * bucketSize) {
            increaseHashTable();
        }
    }

    private void storeID(long bucketOffset, int hash, int id) throws IOException {
        boolean idStored = false;
        ByteBuffer bucket = ByteBuffer.allocate(recordSize);

        while (!idStored) {
            fileChannel.read(bucket, bucketOffset);

            // Find first empty slot in bucket
            int slotID = findEmptySlotInBucket(bucket);

            if (slotID >= 0) {
                // Empty slot found, store the hash and ID in it
                bucket.putInt(ITEM_SIZE * slotID, hash);
                bucket.putInt(ITEM_SIZE * slotID + 4, id);
                bucket.rewind();
                fileChannel.write(bucket, bucketOffset);
                idStored = true;
            } else {
                // No empty slot found, check if bucket has an overflow bucket
                int overflowID = bucket.getInt(ITEM_SIZE * bucketSize);

                if (overflowID == 0) {
                    // No overflow bucket yet, create one
                    overflowID = createOverflowBucket();

                    // Link overflow bucket to current bucket
                    bucket.putInt(ITEM_SIZE * bucketSize, overflowID);
                    bucket.rewind();
                    fileChannel.write(bucket, bucketOffset);
                }

                // Continue searching for an empty slot in the overflow bucket
                bucketOffset = getOverflowBucketOffset(overflowID);
                bucket.clear();
            }
        }
    }

    public void clear() throws IOException {
        // Truncate the file to remove any overflow buckets
        fileChannel.truncate(HEADER_LENGTH + (long) bucketCount * recordSize);

        // Overwrite normal buckets with empty ones
        writeEmptyBuckets(HEADER_LENGTH, bucketCount);

        itemCount = 0;
    }

    /**
     * Syncs any unwritten data to the hash file.
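     * Bucket data is written straight to the file channel, so in practice this
     * amounts to updating the file header and, if <tt>forceSync</tt> is
     * enabled, forcing the channel's changes to disk.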
     */
    public void sync() throws IOException {
        // Update the file header
        writeFileHeader();

        if (forceSync) {
            fileChannel.force(false);
        }
    }

    public void close() throws IOException {
        raf.close();
        raf = null;
        fileChannel = null;
    }

    /*-----------------*
     * Utility methods *
     *-----------------*/

    private RandomAccessFile createEmptyFile(File file) throws IOException {
        // Make sure the file exists
        if (!file.exists()) {
            boolean created = file.createNewFile();
            if (!created) {
                throw new IOException("Failed to create file " + file);
            }
        }

        // Open the file in read-write mode and make sure the file is empty
        RandomAccessFile raf = new RandomAccessFile(file, "rw");
        raf.setLength(0L);

        return raf;
    }

    /**
     * Writes the magic number, file format version, bucket count, bucket size
     * and item count to the file header.
     */
    private void writeFileHeader() throws IOException {
        ByteBuffer buf = ByteBuffer.allocate((int) HEADER_LENGTH);
        buf.put(MAGIC_NUMBER);
        buf.put(FILE_FORMAT_VERSION);
        buf.putInt(bucketCount);
        buf.putInt(bucketSize);
        buf.putInt(itemCount);
        buf.rewind();

        fileChannel.write(buf, 0L);
    }

    /**
     * Reads the bucket count, bucket size and item count from the file header,
     * after checking the magic number and the file format version.
     */
    private void readFileHeader() throws IOException {
        ByteBuffer buf = ByteBuffer.allocate((int) HEADER_LENGTH);
        int bytesRead = fileChannel.read(buf, 0L);

        // Check the number of bytes that were actually read; after a rewind
        // the buffer would always report HEADER_LENGTH remaining bytes,
        // whether or not the read filled it
        if (bytesRead < HEADER_LENGTH) {
            throw new IOException("File too short to be a compatible hash file");
        }

        buf.rewind();

        byte[] magicNumber = new byte[MAGIC_NUMBER.length];
        buf.get(magicNumber);
        byte version = buf.get();
        bucketCount = buf.getInt();
        bucketSize = buf.getInt();
        itemCount = buf.getInt();

        if (!Arrays.equals(MAGIC_NUMBER, magicNumber)) {
            throw new IOException("File doesn't contain compatible hash file data");
        }

        if (version > FILE_FORMAT_VERSION) {
            throw new IOException("Unable to read hash file; it uses a newer file format");
        } else if (version != FILE_FORMAT_VERSION) {
            throw new IOException(
                    "Unable to read hash file; invalid file format version: " + version);
        }
    }

    /**
     * Returns the offset of the bucket for the specified hash code.
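     * For example, with 64 buckets a hash code of -7 maps to bucket
     * (-7 % 64) + 64 = 57; the correction is needed because Java's remainder
     * operator can return negative values.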
     */
    private long getBucketOffset(int hash) {
        int bucketNo = hash % bucketCount;
        if (bucketNo < 0) {
            bucketNo += bucketCount;
        }
        return HEADER_LENGTH + (long) bucketNo * recordSize;
    }

    /**
     * Returns the offset of the overflow bucket with the specified ID.
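     * Overflow buckets are located after the normal buckets and their IDs
     * start at 1, so the overflow bucket with ID 1 immediately follows the
     * last normal bucket.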
     */
    private long getOverflowBucketOffset(int bucketID) {
        return HEADER_LENGTH + ((long) bucketCount + (long) bucketID - 1L) * recordSize;
    }

    /**
     * Creates a new overflow bucket at the end of the file and returns its ID.
     */
    private int createOverflowBucket() throws IOException {
        long offset = fileChannel.size();
        writeEmptyBuckets(offset, 1);
        return (int) ((offset - HEADER_LENGTH) / recordSize) - bucketCount + 1;
    }

    private void writeEmptyBuckets(long fileOffset, int bucketCount) throws IOException {
        ByteBuffer emptyBucket = ByteBuffer.allocate(recordSize);

        for (int i = 0; i < bucketCount; i++) {
            fileChannel.write(emptyBucket, fileOffset + i * (long) recordSize);
            emptyBucket.rewind();
        }
    }

    private int findEmptySlotInBucket(ByteBuffer bucket) {
        for (int slotNo = 0; slotNo < bucketSize; slotNo++) {
            // A slot is empty when its ID is equal to 0
            if (bucket.getInt(ITEM_SIZE * slotNo + 4) == 0) {
                return slotNo;
            }
        }

        return -1;
    }

    /**
     * Doubles the number of buckets in the hash file and rehashes the stored
     * items.
     */
    private void increaseHashTable() throws IOException {

        long oldTableSize = HEADER_LENGTH + (long) bucketCount * recordSize;
        long newTableSize = HEADER_LENGTH + (long) bucketCount * recordSize * 2;
        long oldFileSize = fileChannel.size(); // includes overflow buckets

        // Move any overflow buckets out of the way to a temporary file
        File tmpFile = new File(file.getParentFile(), "rehash_" + file.getName());
        RandomAccessFile tmpRaf = createEmptyFile(tmpFile);
        FileChannel tmpChannel = tmpRaf.getChannel();

        // Transfer the overflow buckets to the temp file. A single call to
        // transferTo() may transfer fewer bytes than requested, so loop until
        // everything has been copied.
        long bytesToTransfer = oldFileSize - oldTableSize;
        long transferred = 0L;
        while (transferred < bytesToTransfer) {
            transferred += fileChannel.transferTo(oldTableSize + transferred,
                    bytesToTransfer - transferred, tmpChannel);
        }

        // Increase hash table by factor 2
        writeEmptyBuckets(oldTableSize, bucketCount);
        bucketCount *= 2;

        // Discard any remaining overflow buckets
        fileChannel.truncate(newTableSize);

        ByteBuffer bucket = ByteBuffer.allocate(recordSize);
        ByteBuffer newBucket = ByteBuffer.allocate(recordSize);

        // Rehash items in 'normal' buckets. Because the bucket count has
        // doubled, an item with hash h moves from bucket (h % oldCount) either
        // to that same bucket or to bucket (h % oldCount) + oldCount. So all
        // items that move out of a bucket end up in one and the same new,
        // empty bucket, and none of them can trigger the creation of new
        // overflow buckets. Any (now obsolete) references to overflow buckets
        // are removed too, and the changes to the old and the new bucket are
        // each written to disk only once.
        for (long bucketOffset = HEADER_LENGTH; bucketOffset < oldTableSize; bucketOffset += recordSize) {
            fileChannel.read(bucket, bucketOffset);

            boolean bucketChanged = false;
            long newBucketOffset = 0L;

            for (int slotNo = 0; slotNo < bucketSize; slotNo++) {
                int id = bucket.getInt(ITEM_SIZE * slotNo + 4);

                if (id != 0) {
                    // Slot is not empty
                    int hash = bucket.getInt(ITEM_SIZE * slotNo);
                    long newOffset = getBucketOffset(hash);

                    if (newOffset != bucketOffset) {
                        // Move this item to the new bucket...
                        newBucket.putInt(hash);
                        newBucket.putInt(id);

                        // ...and remove it from the current bucket
                        bucket.putInt(ITEM_SIZE * slotNo, 0);
                        bucket.putInt(ITEM_SIZE * slotNo + 4, 0);

                        bucketChanged = true;
                        newBucketOffset = newOffset;
                    }
                }
            }

            if (bucketChanged) {
                // Some of the items were moved to the new bucket, write it to
                // the file
                newBucket.flip();
                fileChannel.write(newBucket, newBucketOffset);
                newBucket.clear();
            }

            // Reset the overflow ID in the old bucket to 0, if necessary
            if (bucket.getInt(ITEM_SIZE * bucketSize) != 0) {
                bucket.putInt(ITEM_SIZE * bucketSize, 0);
                bucketChanged = true;
            }

            if (bucketChanged) {
                // Some of the items were moved to the new bucket or the
                // overflow ID has been reset; write the bucket back to the file
                bucket.rewind();
                fileChannel.write(bucket, bucketOffset);
            }

            bucket.clear();
        }

        // Rehash items in overflow buckets. This might trigger the creation of
        // new overflow buckets, so this can't be optimized in the same way as
        // the normal buckets above.
        long tmpFileSize = tmpChannel.size();
        for (long bucketOffset = 0L; bucketOffset < tmpFileSize; bucketOffset += recordSize) {
            tmpChannel.read(bucket, bucketOffset);

            for (int slotNo = 0; slotNo < bucketSize; slotNo++) {
                int id = bucket.getInt(ITEM_SIZE * slotNo + 4);

                if (id != 0) {
                    // Slot is not empty
                    int hash = bucket.getInt(ITEM_SIZE * slotNo);
                    long newBucketOffset = getBucketOffset(hash);

                    // Move this item to its new location...
                    storeID(newBucketOffset, hash, id);

                    // ...and remove it from the current bucket
                    bucket.putInt(ITEM_SIZE * slotNo, 0);
                    bucket.putInt(ITEM_SIZE * slotNo + 4, 0);
                }
            }

            bucket.clear();
        }

        // Discard the temp file
        tmpRaf.close();
        tmpFile.delete();
    }

    public void dumpContents(PrintStream out) throws IOException {
        out.println();
        out.println("*** hash file contents ***");

        out.println("bucketCount=" + bucketCount);
        out.println("bucketSize=" + bucketSize);
        out.println("itemCount=" + itemCount);

        ByteBuffer buf = ByteBuffer.allocate(recordSize);
        fileChannel.position(HEADER_LENGTH);

        out.println("---Buckets---");

        for (int bucketNo = 1; bucketNo <= bucketCount; bucketNo++) {
            buf.clear();
            fileChannel.read(buf);

            out.print("Bucket " + bucketNo + ": ");

            for (int slotNo = 0; slotNo < bucketSize; slotNo++) {
                int hash = buf.getInt(ITEM_SIZE * slotNo);
                int id = buf.getInt(ITEM_SIZE * slotNo + 4);
                if (slotNo > 0) {
                    out.print(" ");
                }
                out.print("[" + toHexString(hash) + "," + id + "]");
            }

            int overflowID = buf.getInt(ITEM_SIZE * bucketSize);
            out.println("---> " + overflowID);
        }

        out.println("---Overflow Buckets---");

        int bucketNo = 0;
        while (fileChannel.position() < fileChannel.size()) {
            buf.clear();
            fileChannel.read(buf);
            bucketNo++;

            out.print("Bucket " + bucketNo + ": ");

            for (int slotNo = 0; slotNo < bucketSize; slotNo++) {
                int hash = buf.getInt(ITEM_SIZE * slotNo);
                int id = buf.getInt(ITEM_SIZE * slotNo + 4);
                if (slotNo > 0) {
                    out.print(" ");
                }
                out.print("[" + toHexString(hash) + "," + id + "]");
            }

            int overflowID = buf.getInt(ITEM_SIZE * bucketSize);
            out.println("---> " + overflowID);
        }

        out.println("*** end of hash file contents ***");
        out.println();
    }

    private String toHexString(int decimal) {
        String hex = Integer.toHexString(decimal);

        StringBuilder result = new StringBuilder(8);
        for (int i = hex.length(); i < 8; i++) {
            result.append("0");
        }
        result.append(hex);

        return result.toString();
    }

    /*------------------------*
     * Inner class IDIterator *
     *------------------------*/

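    /**
     * An iterator over the IDs that have been stored under a specific hash
     * code, following any overflow buckets. Instances are created with
     * {@link HashFile#getIDIterator(int)}.
     */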
    public class IDIterator {

        private int queryHash;

        private ByteBuffer bucketBuffer;

        private long bucketOffset;

        private int slotNo;

        private IDIterator(int hash) throws IOException {
            queryHash = hash;

            bucketBuffer = ByteBuffer.allocate(getRecordSize());

            // Calculate offset for initial bucket
            bucketOffset = getBucketOffset(hash);

            // Read initial bucket
            getFileChannel().read(bucketBuffer, bucketOffset);

            slotNo = -1;
        }

        /**
         * Returns the next ID that has been mapped to the specified hash code,
         * or <tt>-1</tt> if no more IDs were found.
         */
        public int next() throws IOException {
            while (bucketBuffer != null) {
                // Search through the current bucket
                slotNo++;
                while (slotNo < getBucketSize()) {
                    if (bucketBuffer.getInt(ITEM_SIZE * slotNo) == queryHash) {
                        return bucketBuffer.getInt(ITEM_SIZE * slotNo + 4);
                    }
                    slotNo++;
                }

                // No matching hash code in current bucket, check overflow bucket
                int overflowID = bucketBuffer.getInt(ITEM_SIZE * getBucketSize());
                if (overflowID == 0) {
                    // No overflow bucket, end the search
                    bucketBuffer = null;
                    bucketOffset = 0L;
                } else {
                    // Continue with overflow bucket
                    bucketOffset = getOverflowBucketOffset(overflowID);
                    bucketBuffer.clear();
                    getFileChannel().read(bucketBuffer, bucketOffset);
                    slotNo = -1;
                }
            }

            return -1;
        }
    } // End inner class IDIterator
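
    /**
     * Dumps the contents of the hash file that is specified as the first
     * command-line argument to standard output.
     */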
    public static void main(String[] args) throws Exception {
        HashFile hashFile = new HashFile(new File(args[0]));
        hashFile.dumpContents(System.out);
        hashFile.close();
    }

} // End class HashFile