package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

import org.apache.lucene.store.Directory;

/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer than
 *  mergeFactor segments in it.  Whenever a given level has
 *  mergeFactor segments or more in it, they will be merged.</p>
 *
 *  <p>This class is abstract and requires a subclass to
 *  define the {@link #size} method which specifies how a
 *  segment's size is determined.  {@link LogDocMergePolicy}
 *  is one subclass that measures size by document count in
 *  the segment.  {@link LogByteSizeMergePolicy} is another
 *  subclass that measures size as the total byte size of the
 *  file(s) for the segment.</p>
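 *
 *  <p>A minimal usage sketch, assuming an existing {@link
 *  IndexWriter} named <code>writer</code> (the values shown
 *  are illustrative, not recommendations):</p>
 *
 *  <pre>
 *  LogDocMergePolicy mp = new LogDocMergePolicy();
 *  mp.setMergeFactor(10);        // merge 10 segments at a time
 *  mp.setUseCompoundFile(true);  // write merged segments in compound format
 *  writer.setMergePolicy(mp);
 *  </pre>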
 */

public abstract class LogMergePolicy extends MergePolicy {

  /** Defines the allowed range of log(size) for each
   *  level.  A level is computed by taking the max segment
   *  log size, minus LEVEL_LOG_SPAN, and finding all
   *  segments falling within that range. */
  public static final double LEVEL_LOG_SPAN = 0.75;
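  // For example (illustrative numbers, measuring size by doc
  // count as LogDocMergePolicy does): with mergeFactor 10, a
  // 1,000,000-doc segment has level log10(1e6) = 6.0, so all
  // segments down to level 6.0 - 0.75 = 5.25 (roughly 178,000
  // docs, i.e. 10^5.25) fall into the same level.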

  /** Default merge factor, which is how many segments are
   *  merged at a time */
  public static final int DEFAULT_MERGE_FACTOR = 10;

  /** Default maximum segment size.  A segment of this size
   *  or larger will never be merged.
   *  @see #setMaxMergeDocs */
  public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;

  private int mergeFactor = DEFAULT_MERGE_FACTOR;

  long minMergeSize;
  long maxMergeSize;
  int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  private boolean useCompoundFile = true;
  private boolean useCompoundDocStore = true;
  private IndexWriter writer;

  private void message(String message) {
    if (writer != null)
      writer.message("LMP: " + message);
  }

  /** <p>Returns the number of segments that are merged at
   *  once; this value also limits the total number of
   *  segments allowed to accumulate in the index.</p> */
  public int getMergeFactor() {
    return mergeFactor;
  }

  /** Determines how often segment indices are merged by
   *  addDocument().  With smaller values, less RAM is used
   *  while indexing, and searches on unoptimized indices are
   *  faster, but indexing speed is slower.  With larger
   *  values, more RAM is used during indexing, and while
   *  searches on unoptimized indices are slower, indexing is
   *  faster.  Thus larger values (&gt; 10) are best for batch
   *  index creation, and smaller values (&lt; 10) for indices
   *  that are interactively maintained.
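   *
   *  <p>For example (illustrative values only, on a policy
   *  instance named <code>mergePolicy</code>):</p>
   *
   *  <pre>
   *  mergePolicy.setMergeFactor(50); // batch/bulk indexing
   *  mergePolicy.setMergeFactor(5);  // interactive indexing
   *  </pre>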
   */
  public void setMergeFactor(int mergeFactor) {
    if (mergeFactor < 2)
      throw new IllegalArgumentException("mergeFactor cannot be less than 2");
    this.mergeFactor = mergeFactor;
  }

  // Javadoc inherited
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo info) {
    return useCompoundFile;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments. */
  public void setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
  }

  /** Returns true if newly flushed and newly merged segments
   *  are written in compound file format.
   *  @see #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  // Javadoc inherited
  public boolean useCompoundDocStore(SegmentInfos infos) {
    return useCompoundDocStore;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged doc store
   *  segment files (term vectors and stored fields). */
  public void setUseCompoundDocStore(boolean useCompoundDocStore) {
    this.useCompoundDocStore = useCompoundDocStore;
  }

  /** Returns true if newly flushed and newly merged doc
   *  store segment files (term vectors and stored fields)
   *  are written in compound file format.
   *  @see #setUseCompoundDocStore */
  public boolean getUseCompoundDocStore() {
    return useCompoundDocStore;
  }

  public void close() {
  }

  protected abstract long size(SegmentInfo info) throws IOException;

  private boolean isOptimized(SegmentInfos infos, IndexWriter writer,
                              int maxNumSegments, Set segmentsToOptimize)
    throws IOException {
    final int numSegments = infos.size();
    int numToOptimize = 0;
    SegmentInfo optimizeInfo = null;
    for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++) {
      final SegmentInfo info = infos.info(i);
      if (segmentsToOptimize.contains(info)) {
        numToOptimize++;
        optimizeInfo = info;
      }
    }

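    // Optimized when no more than maxNumSegments segments are
    // marked for optimizing and, if exactly one is, that
    // segment itself already looks optimized: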
    return numToOptimize <= maxNumSegments
      && (numToOptimize != 1 || isOptimized(writer, optimizeInfo));
  }

  /** Returns true if this single info is optimized (has no
   *  pending norms or deletes, is in the same dir as the
   *  writer, and matches the current compound file setting). */
  private boolean isOptimized(IndexWriter writer, SegmentInfo info)
    throws IOException {
    return !info.hasDeletions() && !info.hasSeparateNorms()
      && info.dir == writer.getDirectory()
      && info.getUseCompoundFile() == useCompoundFile;
  }

  /** Returns the merges necessary to optimize the index.
   *  This merge policy defines "optimized" to mean only one
   *  segment in the index, where that segment has no
   *  deletions pending nor separate norms, and it is in
   *  compound file format if the current useCompoundFile
   *  setting is true.  This method returns multiple merges
   *  (mergeFactor at a time) so the {@link MergeScheduler}
   *  in use may make use of concurrency. */
  public MergeSpecification findMergesForOptimize(SegmentInfos infos,
                                                  IndexWriter writer,
                                                  int maxNumSegments,
                                                  Set segmentsToOptimize)
    throws IOException {
    MergeSpecification spec;

    assert maxNumSegments > 0;

    if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize)) {

      // Find the newest (rightmost) segment that needs to
      // be optimized (other segments may have been flushed
      // since optimize started):
      int last = infos.size();
      while (last > 0) {
        final SegmentInfo info = infos.info(--last);
        if (segmentsToOptimize.contains(info)) {
          last++;
          break;
        }
      }

      if (last > 0) {

        spec = new MergeSpecification();

        // First, enroll all "full" merges (size
        // mergeFactor) to potentially be run concurrently:
        while (last - maxNumSegments + 1 >= mergeFactor) {
          spec.add(new OneMerge(infos.range(last - mergeFactor, last),
                                useCompoundFile));
          last -= mergeFactor;
        }
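        // (For example, with mergeFactor 10, maxNumSegments 1
        // and 25 eligible segments, this loop enrolls two
        // 10-segment merges; the remaining segments are picked
        // up by a later call once those merges complete.)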

        // Only if there are no full merges pending do we
        // add a final partial (< mergeFactor segments) merge:
        if (0 == spec.merges.size()) {
          if (maxNumSegments == 1) {

            // Since we must optimize down to 1 segment, the
            // choice is simple:
            if (last > 1 || !isOptimized(writer, infos.info(0)))
              spec.add(new OneMerge(infos.range(0, last),
                                    useCompoundFile));
          } else if (last > maxNumSegments) {

            // Take care to pick a partial merge that is
            // least cost, but does not make the index too
            // lopsided.  If we always just picked the
            // partial tail then we could produce a highly
            // lopsided index over time:

            // We must merge this many segments to leave
            // maxNumSegments in the index (from when
            // optimize was first kicked off):
            final int finalMergeSize = last - maxNumSegments + 1;

            // Consider all possible starting points:
            long bestSize = 0;
            int bestStart = 0;

            for (int i = 0; i < last - finalMergeSize + 1; i++) {
              long sumSize = 0;
              for (int j = 0; j < finalMergeSize; j++)
                sumSize += size(infos.info(j + i));
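              // Prefer the cheapest window so far, but only if
              // it is less than twice the size of the segment
              // just before it, to avoid a lopsided result: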
              if (i == 0
                  || (sumSize < 2 * size(infos.info(i - 1))
                      && sumSize < bestSize)) {
                bestStart = i;
                bestSize = sumSize;
              }
            }

            spec.add(new OneMerge(infos.range(bestStart,
                                              bestStart + finalMergeSize),
                                  useCompoundFile));
          }
        }

      } else
        spec = null;
    } else
      spec = null;

    return spec;
  }

  /** Checks if any merges are now necessary and returns a
   *  {@link MergePolicy.MergeSpecification} if so.  A merge
   *  is necessary when there are more than {@link
   *  #setMergeFactor} segments at a given level.  When
   *  multiple levels have too many segments, this method
   *  will return multiple merges, allowing the {@link
   *  MergeScheduler} to use concurrency. */
  public MergeSpecification findMerges(SegmentInfos infos,
                                       IndexWriter writer)
    throws IOException {

    final int numSegments = infos.size();
    this.writer = writer;
    message("findMerges: " + numSegments + " segments");

    // Compute levels, which is just log (base mergeFactor)
    // of the size of each segment
    float[] levels = new float[numSegments];
    final float norm = (float) Math.log(mergeFactor);
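    // (Since log_b(x) = ln(x)/ln(b), dividing ln(size) by norm
    // below yields the log base mergeFactor; e.g. with
    // mergeFactor 10, a segment of size 1,000,000 gets level 6.0.)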

    final Directory directory = writer.getDirectory();

    for (int i = 0; i < numSegments; i++) {
      final SegmentInfo info = infos.info(i);
      long size = size(info);

      // Refuse to import a segment that's too large
      if (info.docCount > maxMergeDocs && info.dir != directory)
        throw new IllegalArgumentException("Segment is too large ("
            + info.docCount + " docs vs max docs " + maxMergeDocs + ")");

      if (size >= maxMergeSize && info.dir != directory)
        throw new IllegalArgumentException("Segment is too large ("
            + size + " vs max size " + maxMergeSize + ")");

      // Floor tiny segments
      if (size < 1)
        size = 1;
      levels[i] = (float) Math.log(size) / norm;
    }

    final float levelFloor;
    if (minMergeSize <= 0)
      levelFloor = (float) 0.0;
    else
      levelFloor = (float) (Math.log(minMergeSize) / norm);

    // Now, we quantize the log values into levels.  The
    // first level is any segment whose log size is within
    // LEVEL_LOG_SPAN of the max size, or that has such a
    // segment "to the right".  Then, we find the max of all
    // other segments and use that to define the next level
    // segment, etc.
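    // (Worked example, illustrative only: with mergeFactor 10
    // and segments at levels 6.0, 5.0 and 4.0, the first pass
    // groups only segments with level >= 6.0 - 0.75 = 5.25,
    // i.e. just the first one; later passes handle the rest.)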

    MergeSpecification spec = null;

    int start = 0;
    while (start < numSegments) {

      // Find max level of all segments not already
      // quantized.
      float maxLevel = levels[start];
      for (int i = 1 + start; i < numSegments; i++) {
        final float level = levels[i];
        if (level > maxLevel)
          maxLevel = level;
      }

      // Now search backwards for the rightmost segment that
      // falls into this level:
      float levelBottom;
      if (maxLevel < levelFloor)
        // All remaining segments fall into the min level
        levelBottom = -1.0F;
      else {
        levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

        // Force a boundary at the level floor
        if (levelBottom < levelFloor && maxLevel >= levelFloor)
          levelBottom = levelFloor;
      }

      int upto = numSegments - 1;
      while (upto >= start) {
        if (levels[upto] >= levelBottom) {
          break;
        }
        upto--;
      }
      message(" level " + levelBottom + " to " + maxLevel + ": "
              + (1 + upto - start) + " segments");

      // Finally, record all merges that are viable at this level:
      int end = start + mergeFactor;
      while (end <= 1 + upto) {
        boolean anyTooLarge = false;
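        // A merge window is skipped entirely if any segment in
        // it is already at or over the max size or doc limits: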
        for (int i = start; i < end; i++) {
          final SegmentInfo info = infos.info(i);
          anyTooLarge |= (size(info) >= maxMergeSize
                          || info.docCount >= maxMergeDocs);
        }

        if (!anyTooLarge) {
          if (spec == null)
            spec = new MergeSpecification();
          message(" " + start + " to " + end + ": add this merge");
          spec.add(new OneMerge(infos.range(start, end),
                                useCompoundFile));
        } else
          message(" " + start + " to " + end
                  + ": contains segment over maxMergeSize or maxMergeDocs; skipping");

        start = end;
        end = start + mergeFactor;
      }

      start = 1 + upto;
    }

    return spec;
  }

  /** <p>Determines the largest segment (measured by
   *  document count) that may be merged with other segments.
   *  Small values (e.g., less than 10,000) are best for
   *  interactive indexing, as this limits the length of
   *  pauses while indexing to a few seconds.  Larger values
   *  are best for batched indexing and speedier
   *  searches.</p>
   *
   *  <p>The default value is {@link Integer#MAX_VALUE}.</p>
   *
   *  <p>The default merge policy ({@link
   *  LogByteSizeMergePolicy}) also allows you to set this
   *  limit by net size (in MB) of the segment, using {@link
   *  LogByteSizeMergePolicy#setMaxMergeMB}.</p>
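   *
   *  <p>A minimal sketch (the value is illustrative, not a
   *  recommendation):</p>
   *
   *  <pre>
   *  mergePolicy.setMaxMergeDocs(100000); // segments over 100,000 docs are never merged
   *  </pre>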
   */
  public void setMaxMergeDocs(int maxMergeDocs) {
    this.maxMergeDocs = maxMergeDocs;
  }

  /** Returns the largest segment (measured by document
   *  count) that may be merged with other segments.
   *  @see #setMaxMergeDocs */
  public int getMaxMergeDocs() {
    return maxMergeDocs;
  }

}