/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.solr.util.BitUtil;

/**
 * <code>HashDocSet</code> represents an unordered set of Lucene Document Ids
 * using a primitive int hash table. It can be a better choice when the set
 * holds relatively few docs, because it takes up less memory and is faster
 * to iterate and to intersect.
 *
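 * <p>For example, to build a set from a few unique ids and test membership:
 * <pre>
 *   int[] ids = new int[] {4, 17, 42};
 *   HashDocSet docs = new HashDocSet(ids, 0, ids.length);
 *   boolean found = docs.exists(17); // true
 * </pre>
 *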
 * @author yonik
 * @version $Id: HashDocSet.java 498246 2007-01-21 05:46:31Z yonik $
 * @since solr 0.9
 */
public final class HashDocSet extends DocSetBase {
  /** Default load factor to use for HashDocSets. We keep track of the inverse
   * since multiplication is so much faster than division. The default
   * is 1.0f / 0.75f.
   */
  static final float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f / 0.75f;

  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);

  // lucene docs are numbered from 0, so a neg number must be used for missing.
  // an alternative to having to init the array to EMPTY at the start would be
  // to store docid+1 in the table, so that 0 could mark an empty slot.
  private final static int EMPTY = -1;
  private final int[] table;
  private final int size;

  private final int mask;

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }
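    // e.g. len=100: nextHighestPowerOfTwo(100) = 128, and since
    // 128 < 100 * (1/0.75) the table doubles to 256 entries, giving
    // an actual load factor of about 0.39.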

    mask = tsize - 1;

    table = new int[tsize];
    for (int i = tsize - 1; i >= 0; i--)
      table[i] = EMPTY;

    // add docs[offset] through docs[offset+len-1]
    int end = offset + len;
    for (int i = offset; i < end; i++) {
      put(docs[i]);
    }

    size = len;
  }

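  /** Puts a doc id into the table. Callers must ensure the id is not already
   * present and that at least one EMPTY slot remains (the load factor used at
   * construction time guarantees this). */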
  void put(int doc) {
    int s = doc & mask;
    while (table[s] != EMPTY) {
      // Adding an odd number to this power-of-two hash table is
      // guaranteed to do a full traversal, so instead of re-hashing
      // we jump straight to a "linear" traversal.
      // The key is that we provide many different ways to do the
      // traversal (tablesize/2) based on the last hash code (the doc).
      // Rely on loop-invariant code motion to evaluate ((doc>>7)|1) only
      // once; otherwise we would need to pull the first probe out of the loop.
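      // Worked example: with tsize=16 (mask=15) and doc=300, the first slot
      // is 300&15=12 and the step is ((300>>7)|1)=3; an odd step in a
      // power-of-two table visits all 16 slots before any slot repeats.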
      s = (s + ((doc >> 7) | 1)) & mask;
    }
    table[s] = doc;
  }

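  /** Returns true if the given doc id is in the set. With the default load
   * factor the table always retains at least one EMPTY slot, so a probe for
   * a missing id is guaranteed to terminate. */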
  public boolean exists(int doc) {
    int s = doc & mask;
    for (;;) {
      int v = table[s];
      if (v == EMPTY) return false;
      if (v == doc) return true;
      // see put() for algorithm details.
      s = (s + ((doc >> 7) | 1)) & mask;
    }
  }

  public int size() {
    return size;
  }

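  /** Returns an iterator over the doc ids. Iteration follows hash-table slot
   * order, not increasing docid order; this is an unordered set. */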
  public DocIterator iterator() {
    return new DocIterator() {
      int pos = 0;
      { goNext(); }

      public boolean hasNext() {
        return pos < table.length;
      }

      public Integer next() {
        return nextDoc();
      }

      public void remove() {
        // this set is immutable; failing loudly beats a silent no-op,
        // which would violate the Iterator contract.
        throw new UnsupportedOperationException();
      }

      void goNext() {
        while (pos < table.length && table[pos] == EMPTY) pos++;
      }

      // TODO: modify to return -1 at end of iteration?
      public int nextDoc() {
        int doc = table[pos];
        pos++;
        goNext();
        return doc;
      }

      public float score() {
        return 0.0f;
      }
    };
  }

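  /** A rough estimate: 4 bytes per int table entry, plus about 20 bytes to
   * account for the object header and the size/mask fields. */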
  public long memSize() {
    return (table.length << 2) + 20;
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
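      // (scanning the smaller table bounds the work at one pass over its
      // slots plus one hash probe per live entry)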
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size()];
      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);

    } else {

      int[] result = new int[size()];
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    }
  }

  @Override
  public int intersectionSize(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    } else {
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    }
  }

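  /** Returns the documents in this set that do not exist in the other set. */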
  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount = 0;

    for (int i = 0; i < table.length; i++) {
      int id = table[i];
      if (id >= 0 && !other.exists(id)) {
        result[resultCount++] = id;
      }
    }
    return new HashDocSet(result, 0, resultCount);
  }

  @Override
  public DocSet union(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size() + b.size()];
      int resultCount = 0;
      // iterate over the larger table first, adding without checking.
      for (int i = 0; i < b.table.length; i++) {
        int id = b.table[i];
        if (id >= 0) result[resultCount++] = id;
      }

      // now iterate over the smaller set, adding everything not already
      // in the larger set.
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && !b.exists(id)) result[resultCount++] = id;
      }

      return new HashDocSet(result, 0, resultCount);
    } else {
      return other.union(this);
    }
  }

  // andNotSize() and unionSize() are deliberately not overridden; they are
  // implemented in DocSetBase in terms of intersectionSize().
}