001: package org.apache.lucene.index;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021:
022: import org.apache.lucene.store.Directory;
023: import org.apache.lucene.store.BufferedIndexInput;
024:
025: /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
026: * Directory. Pairs are accessed either by Term or by ordinal position the
027: * set. */
028:
029: final class TermInfosReader {
030: private Directory directory;
031: private String segment;
032: private FieldInfos fieldInfos;
033:
034: private ThreadLocal enumerators = new ThreadLocal();
035: private SegmentTermEnum origEnum;
036: private long size;
037:
038: private Term[] indexTerms = null;
039: private TermInfo[] indexInfos;
040: private long[] indexPointers;
041:
042: private SegmentTermEnum indexEnum;
043:
044: private int indexDivisor = 1;
045: private int totalIndexInterval;
046:
047: TermInfosReader(Directory dir, String seg, FieldInfos fis)
048: throws CorruptIndexException, IOException {
049: this (dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
050: }
051:
052: TermInfosReader(Directory dir, String seg, FieldInfos fis,
053: int readBufferSize) throws CorruptIndexException,
054: IOException {
055: boolean success = false;
056:
057: try {
058: directory = dir;
059: segment = seg;
060: fieldInfos = fis;
061:
062: origEnum = new SegmentTermEnum(directory.openInput(segment
063: + ".tis", readBufferSize), fieldInfos, false);
064: size = origEnum.size;
065: totalIndexInterval = origEnum.indexInterval;
066:
067: indexEnum = new SegmentTermEnum(directory.openInput(segment
068: + ".tii", readBufferSize), fieldInfos, true);
069:
070: success = true;
071: } finally {
072: // With lock-less commits, it's entirely possible (and
073: // fine) to hit a FileNotFound exception above. In
074: // this case, we want to explicitly close any subset
075: // of things that were opened so that we don't have to
076: // wait for a GC to do so.
077: if (!success) {
078: close();
079: }
080: }
081: }
082:
083: public int getSkipInterval() {
084: return origEnum.skipInterval;
085: }
086:
087: public int getMaxSkipLevels() {
088: return origEnum.maxSkipLevels;
089: }
090:
091: /**
092: * <p>Sets the indexDivisor, which subsamples the number
093: * of indexed terms loaded into memory. This has a
094: * similar effect as {@link
095: * IndexWriter#setTermIndexInterval} except that setting
096: * must be done at indexing time while this setting can be
097: * set per reader. When set to N, then one in every
098: * N*termIndexInterval terms in the index is loaded into
099: * memory. By setting this to a value > 1 you can reduce
100: * memory usage, at the expense of higher latency when
101: * loading a TermInfo. The default value is 1.</p>
102: *
103: * <b>NOTE:</b> you must call this before the term
104: * index is loaded. If the index is already loaded,
105: * an IllegalStateException is thrown.
106: *
107: + @throws IllegalStateException if the term index has
108: * already been loaded into memory.
109: */
110: public void setIndexDivisor(int indexDivisor)
111: throws IllegalStateException {
112: if (indexDivisor < 1)
113: throw new IllegalArgumentException(
114: "indexDivisor must be > 0: got " + indexDivisor);
115:
116: if (indexTerms != null)
117: throw new IllegalStateException(
118: "index terms are already loaded");
119:
120: this .indexDivisor = indexDivisor;
121: totalIndexInterval = origEnum.indexInterval * indexDivisor;
122: }
123:
124: /** Returns the indexDivisor.
125: * @see #setIndexDivisor
126: */
127: public int getIndexDivisor() {
128: return indexDivisor;
129: }
130:
131: final void close() throws IOException {
132: if (origEnum != null)
133: origEnum.close();
134: if (indexEnum != null)
135: indexEnum.close();
136: enumerators.set(null);
137: }
138:
139: /** Returns the number of term/value pairs in the set. */
140: final long size() {
141: return size;
142: }
143:
144: private SegmentTermEnum getEnum() {
145: SegmentTermEnum termEnum = (SegmentTermEnum) enumerators.get();
146: if (termEnum == null) {
147: termEnum = terms();
148: enumerators.set(termEnum);
149: }
150: return termEnum;
151: }
152:
153: private synchronized void ensureIndexIsRead() throws IOException {
154: if (indexTerms != null) // index already read
155: return; // do nothing
156: try {
157: int indexSize = 1 + ((int) indexEnum.size - 1)
158: / indexDivisor; // otherwise read index
159:
160: indexTerms = new Term[indexSize];
161: indexInfos = new TermInfo[indexSize];
162: indexPointers = new long[indexSize];
163:
164: for (int i = 0; indexEnum.next(); i++) {
165: indexTerms[i] = indexEnum.term();
166: indexInfos[i] = indexEnum.termInfo();
167: indexPointers[i] = indexEnum.indexPointer;
168:
169: for (int j = 1; j < indexDivisor; j++)
170: if (!indexEnum.next())
171: break;
172: }
173: } finally {
174: indexEnum.close();
175: indexEnum = null;
176: }
177: }
178:
179: /** Returns the offset of the greatest index entry which is less than or equal to term.*/
180: private final int getIndexOffset(Term term) {
181: int lo = 0; // binary search indexTerms[]
182: int hi = indexTerms.length - 1;
183:
184: while (hi >= lo) {
185: int mid = (lo + hi) >> 1;
186: int delta = term.compareTo(indexTerms[mid]);
187: if (delta < 0)
188: hi = mid - 1;
189: else if (delta > 0)
190: lo = mid + 1;
191: else
192: return mid;
193: }
194: return hi;
195: }
196:
197: private final void seekEnum(int indexOffset) throws IOException {
198: getEnum().seek(indexPointers[indexOffset],
199: (indexOffset * totalIndexInterval) - 1,
200: indexTerms[indexOffset], indexInfos[indexOffset]);
201: }
202:
203: /** Returns the TermInfo for a Term in the set, or null. */
204: TermInfo get(Term term) throws IOException {
205: if (size == 0)
206: return null;
207:
208: ensureIndexIsRead();
209:
210: // optimize sequential access: first try scanning cached enum w/o seeking
211: SegmentTermEnum enumerator = getEnum();
212: if (enumerator.term() != null // term is at or past current
213: && ((enumerator.prev() != null && term
214: .compareTo(enumerator.prev()) > 0) || term
215: .compareTo(enumerator.term()) >= 0)) {
216: int enumOffset = (int) (enumerator.position / totalIndexInterval) + 1;
217: if (indexTerms.length == enumOffset // but before end of block
218: || term.compareTo(indexTerms[enumOffset]) < 0)
219: return scanEnum(term); // no need to seek
220: }
221:
222: // random-access: must seek
223: seekEnum(getIndexOffset(term));
224: return scanEnum(term);
225: }
226:
227: /** Scans within block for matching term. */
228: private final TermInfo scanEnum(Term term) throws IOException {
229: SegmentTermEnum enumerator = getEnum();
230: enumerator.scanTo(term);
231: if (enumerator.term() != null
232: && term.compareTo(enumerator.term()) == 0)
233: return enumerator.termInfo();
234: else
235: return null;
236: }
237:
238: /** Returns the nth term in the set. */
239: final Term get(int position) throws IOException {
240: if (size == 0)
241: return null;
242:
243: SegmentTermEnum enumerator = getEnum();
244: if (enumerator != null
245: && enumerator.term() != null
246: && position >= enumerator.position
247: && position < (enumerator.position + totalIndexInterval))
248: return scanEnum(position); // can avoid seek
249:
250: seekEnum(position / totalIndexInterval); // must seek
251: return scanEnum(position);
252: }
253:
254: private final Term scanEnum(int position) throws IOException {
255: SegmentTermEnum enumerator = getEnum();
256: while (enumerator.position < position)
257: if (!enumerator.next())
258: return null;
259:
260: return enumerator.term();
261: }
262:
263: /** Returns the position of a Term in the set or -1. */
264: final long getPosition(Term term) throws IOException {
265: if (size == 0)
266: return -1;
267:
268: ensureIndexIsRead();
269: int indexOffset = getIndexOffset(term);
270: seekEnum(indexOffset);
271:
272: SegmentTermEnum enumerator = getEnum();
273: while (term.compareTo(enumerator.term()) > 0
274: && enumerator.next()) {
275: }
276:
277: if (term.compareTo(enumerator.term()) == 0)
278: return enumerator.position;
279: else
280: return -1;
281: }
282:
283: /** Returns an enumeration of all the Terms and TermInfos in the set. */
284: public SegmentTermEnum terms() {
285: return (SegmentTermEnum) origEnum.clone();
286: }
287:
288: /** Returns an enumeration of terms starting at or after the named term. */
289: public SegmentTermEnum terms(Term term) throws IOException {
290: get(term);
291: return (SegmentTermEnum) getEnum().clone();
292: }
293: }
|