001: package org.apache.lucene.util;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021:
022: import org.apache.lucene.store.Directory;
023: import org.apache.lucene.store.IndexInput;
024: import org.apache.lucene.store.IndexOutput;
025:
026: /** Optimized implementation of a vector of bits. This is more-or-less like
027: java.util.BitSet, but also includes the following:
028: <ul>
029: <li>a count() method, which efficiently computes the number of one bits;</li>
030: <li>optimized read from and write to disk;</li>
031: <li>inlinable get() method;</li>
032: <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
033: </ul>
034:
035:
036: @version $Id: BitVector.java 564236 2007-08-09 15:21:19Z gsingers $
037: */
038: public final class BitVector {
039:
040: private byte[] bits;
041: private int size;
042: private int count = -1;
043:
044: /** Constructs a vector capable of holding <code>n</code> bits. */
045: public BitVector(int n) {
046: size = n;
047: bits = new byte[(size >> 3) + 1];
048: }
049:
050: /** Sets the value of <code>bit</code> to one. */
051: public final void set(int bit) {
052: if (bit >= size) {
053: throw new ArrayIndexOutOfBoundsException(bit);
054: }
055: bits[bit >> 3] |= 1 << (bit & 7);
056: count = -1;
057: }
058:
059: /** Sets the value of <code>bit</code> to zero. */
060: public final void clear(int bit) {
061: if (bit >= size) {
062: throw new ArrayIndexOutOfBoundsException(bit);
063: }
064: bits[bit >> 3] &= ~(1 << (bit & 7));
065: count = -1;
066: }
067:
068: /** Returns <code>true</code> if <code>bit</code> is one and
069: <code>false</code> if it is zero. */
070: public final boolean get(int bit) {
071: if (bit >= size) {
072: throw new ArrayIndexOutOfBoundsException(bit);
073: }
074: return (bits[bit >> 3] & (1 << (bit & 7))) != 0;
075: }
076:
077: /** Returns the number of bits in this vector. This is also one greater than
078: the number of the largest valid bit number. */
079: public final int size() {
080: return size;
081: }
082:
083: /** Returns the total number of one bits in this vector. This is efficiently
084: computed and cached, so that, if the vector is not changed, no
085: recomputation is done for repeated calls. */
086: public final int count() {
087: // if the vector has been modified
088: if (count == -1) {
089: int c = 0;
090: int end = bits.length;
091: for (int i = 0; i < end; i++)
092: c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
093: count = c;
094: }
095: return count;
096: }
097:
098: private static final byte[] BYTE_COUNTS = { // table of bits/byte
099: 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3,
100: 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
101: 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
102: 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3,
103: 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4,
104: 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5,
105: 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4,
106: 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3,
107: 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5,
108: 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
109: 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
110: 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5,
111: 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 };
112:
113: /** Writes this vector to the file <code>name</code> in Directory
114: <code>d</code>, in a format that can be read by the constructor {@link
115: #BitVector(Directory, String)}. */
116: public final void write(Directory d, String name)
117: throws IOException {
118: IndexOutput output = d.createOutput(name);
119: try {
120: if (isSparse()) {
121: writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
122: } else {
123: writeBits(output);
124: }
125: } finally {
126: output.close();
127: }
128: }
129:
130: /** Write as a bit set */
131: private void writeBits(IndexOutput output) throws IOException {
132: output.writeInt(size()); // write size
133: output.writeInt(count()); // write count
134: output.writeBytes(bits, bits.length);
135: }
136:
137: /** Write as a d-gaps list */
138: private void writeDgaps(IndexOutput output) throws IOException {
139: output.writeInt(-1); // mark using d-gaps
140: output.writeInt(size()); // write size
141: output.writeInt(count()); // write count
142: int last = 0;
143: int n = count();
144: int m = bits.length;
145: for (int i = 0; i < m && n > 0; i++) {
146: if (bits[i] != 0) {
147: output.writeVInt(i - last);
148: output.writeByte(bits[i]);
149: last = i;
150: n -= BYTE_COUNTS[bits[i] & 0xFF];
151: }
152: }
153: }
154:
155: /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
156: private boolean isSparse() {
157: // note: order of comparisons below set to favor smaller values (no binary range search.)
158: // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
159: // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
160: // multiplying count by (8+8) or (8+16) or (8+24) etc.:
161: // - first 8 for writing bits[i] (1 byte vs. 1 bit), and
162: // - second part for writing the byte-number d-gap as vint.
163: // note: factor is for read/write of byte-arrays being faster than vints.
164: int factor = 10;
165: if (bits.length < (1 << 7))
166: return factor * (4 + (8 + 8) * count()) < size();
167: if (bits.length < (1 << 14))
168: return factor * (4 + (8 + 16) * count()) < size();
169: if (bits.length < (1 << 21))
170: return factor * (4 + (8 + 24) * count()) < size();
171: if (bits.length < (1 << 28))
172: return factor * (4 + (8 + 32) * count()) < size();
173: return factor * (4 + (8 + 40) * count()) < size();
174: }
175:
176: /** Constructs a bit vector from the file <code>name</code> in Directory
177: <code>d</code>, as written by the {@link #write} method.
178: */
179: public BitVector(Directory d, String name) throws IOException {
180: IndexInput input = d.openInput(name);
181: try {
182: size = input.readInt(); // read size
183: if (size == -1) {
184: readDgaps(input);
185: } else {
186: readBits(input);
187: }
188: } finally {
189: input.close();
190: }
191: }
192:
193: /** Read as a bit set */
194: private void readBits(IndexInput input) throws IOException {
195: count = input.readInt(); // read count
196: bits = new byte[(size >> 3) + 1]; // allocate bits
197: input.readBytes(bits, 0, bits.length);
198: }
199:
200: /** read as a d-gaps list */
201: private void readDgaps(IndexInput input) throws IOException {
202: size = input.readInt(); // (re)read size
203: count = input.readInt(); // read count
204: bits = new byte[(size >> 3) + 1]; // allocate bits
205: int last = 0;
206: int n = count();
207: while (n > 0) {
208: last += input.readVInt();
209: bits[last] = input.readByte();
210: n -= BYTE_COUNTS[bits[last] & 0xFF];
211: }
212: }
213:
214: }
|