001: package org.apache.lucene.index;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import org.apache.lucene.store.IndexOutput;
022: import org.apache.lucene.store.Directory;
023:
024: /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
025: Directory. A TermInfos can be written once, in order. */
026:
027: final class TermInfosWriter {
028: /** The file format version, a negative number. */
029: public static final int FORMAT = -3;
030:
031: private FieldInfos fieldInfos;
032: private IndexOutput output;
033: private TermInfo lastTi = new TermInfo();
034: private long size;
035:
036: // TODO: the default values for these two parameters should be settable from
037: // IndexWriter. However, once that's done, folks will start setting them to
038: // ridiculous values and complaining that things don't work well, as with
039: // mergeFactor. So, let's wait until a number of folks find that alternate
040: // values work better. Note that both of these values are stored in the
041: // segment, so that it's safe to change these w/o rebuilding all indexes.
042:
043: /** Expert: The fraction of terms in the "dictionary" which should be stored
044: * in RAM. Smaller values use more memory, but make searching slightly
045: * faster, while larger values use less memory and make searching slightly
046: * slower. Searching is typically not dominated by dictionary lookup, so
047: * tweaking this is rarely useful.*/
048: int indexInterval = 128;
049:
050: /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
051: * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in
052: * smaller indexes, greater acceleration, but fewer accelerable cases, while
053: * smaller values result in bigger indexes, less acceleration and more
054: * accelerable cases. More detailed experiments would be useful here. */
055: int skipInterval = 16;
056:
057: /** Expert: The maximum number of skip levels. Smaller values result in
058: * slightly smaller indexes, but slower skipping in big posting lists.
059: */
060: int maxSkipLevels = 10;
061:
062: private long lastIndexPointer;
063: private boolean isIndex;
064: private char[] lastTermText = new char[10];
065: private int lastTermTextLength;
066: private int lastFieldNumber = -1;
067:
068: private char[] termTextBuffer = new char[10];
069:
070: private TermInfosWriter other;
071:
072: TermInfosWriter(Directory directory, String segment,
073: FieldInfos fis, int interval) throws IOException {
074: initialize(directory, segment, fis, interval, false);
075: other = new TermInfosWriter(directory, segment, fis, interval,
076: true);
077: other.other = this ;
078: }
079:
080: private TermInfosWriter(Directory directory, String segment,
081: FieldInfos fis, int interval, boolean isIndex)
082: throws IOException {
083: initialize(directory, segment, fis, interval, isIndex);
084: }
085:
086: private void initialize(Directory directory, String segment,
087: FieldInfos fis, int interval, boolean isi)
088: throws IOException {
089: indexInterval = interval;
090: fieldInfos = fis;
091: isIndex = isi;
092: output = directory.createOutput(segment
093: + (isIndex ? ".tii" : ".tis"));
094: output.writeInt(FORMAT); // write format
095: output.writeLong(0); // leave space for size
096: output.writeInt(indexInterval); // write indexInterval
097: output.writeInt(skipInterval); // write skipInterval
098: output.writeInt(maxSkipLevels); // write maxSkipLevels
099: }
100:
101: void add(Term term, TermInfo ti) throws IOException {
102:
103: final int length = term.text.length();
104: if (termTextBuffer.length < length)
105: termTextBuffer = new char[(int) (length * 1.25)];
106:
107: term.text.getChars(0, length, termTextBuffer, 0);
108:
109: add(fieldInfos.fieldNumber(term.field), termTextBuffer, 0,
110: length, ti);
111: }
112:
113: // Currently used only by assert statement
114: private int compareToLastTerm(int fieldNumber, char[] termText,
115: int start, int length) {
116: int pos = 0;
117:
118: if (lastFieldNumber != fieldNumber) {
119: final int cmp = fieldInfos.fieldName(lastFieldNumber)
120: .compareTo(fieldInfos.fieldName(fieldNumber));
121: // If there is a field named "" (empty string) then we
122: // will get 0 on this comparison, yet, it's "OK". But
123: // it's not OK if two different field numbers map to
124: // the same name.
125: if (cmp != 0 || lastFieldNumber != -1)
126: return cmp;
127: }
128:
129: while (pos < length && pos < lastTermTextLength) {
130: final char c1 = lastTermText[pos];
131: final char c2 = termText[pos + start];
132: if (c1 < c2)
133: return -1;
134: else if (c1 > c2)
135: return 1;
136: pos++;
137: }
138:
139: if (pos < lastTermTextLength)
140: // Last term was longer
141: return 1;
142: else if (pos < length)
143: // Last term was shorter
144: return -1;
145: else
146: return 0;
147: }
148:
149: /** Adds a new <<fieldNumber, termText>, TermInfo> pair to the set.
150: Term must be lexicographically greater than all previous Terms added.
151: TermInfo pointers must be positive and greater than all previous.*/
152: void add(int fieldNumber, char[] termText, int termTextStart,
153: int termTextLength, TermInfo ti) throws IOException {
154:
155: assert compareToLastTerm(fieldNumber, termText, termTextStart,
156: termTextLength) < 0
157: || (isIndex && termTextLength == 0 && lastTermTextLength == 0) : "Terms are out of order: field="
158: + fieldInfos.fieldName(fieldNumber)
159: + " (number "
160: + fieldNumber
161: + ")"
162: + " lastField="
163: + fieldInfos.fieldName(lastFieldNumber)
164: + " (number "
165: + lastFieldNumber
166: + ")"
167: + " text="
168: + new String(termText, termTextStart, termTextLength)
169: + " lastText="
170: + new String(lastTermText, 0, lastTermTextLength);
171:
172: assert ti.freqPointer >= lastTi.freqPointer : "freqPointer out of order ("
173: + ti.freqPointer + " < " + lastTi.freqPointer + ")";
174: assert ti.proxPointer >= lastTi.proxPointer : "proxPointer out of order ("
175: + ti.proxPointer + " < " + lastTi.proxPointer + ")";
176:
177: if (!isIndex && size % indexInterval == 0)
178: other.add(lastFieldNumber, lastTermText, 0,
179: lastTermTextLength, lastTi); // add an index term
180:
181: writeTerm(fieldNumber, termText, termTextStart, termTextLength); // write term
182:
183: output.writeVInt(ti.docFreq); // write doc freq
184: output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
185: output.writeVLong(ti.proxPointer - lastTi.proxPointer);
186:
187: if (ti.docFreq >= skipInterval) {
188: output.writeVInt(ti.skipOffset);
189: }
190:
191: if (isIndex) {
192: output.writeVLong(other.output.getFilePointer()
193: - lastIndexPointer);
194: lastIndexPointer = other.output.getFilePointer(); // write pointer
195: }
196:
197: if (lastTermText.length < termTextLength)
198: lastTermText = new char[(int) (termTextLength * 1.25)];
199: System.arraycopy(termText, termTextStart, lastTermText, 0,
200: termTextLength);
201: lastTermTextLength = termTextLength;
202: lastFieldNumber = fieldNumber;
203:
204: lastTi.set(ti);
205: size++;
206: }
207:
208: private void writeTerm(int fieldNumber, char[] termText,
209: int termTextStart, int termTextLength) throws IOException {
210:
211: // Compute prefix in common with last term:
212: int start = 0;
213: final int limit = termTextLength < lastTermTextLength ? termTextLength
214: : lastTermTextLength;
215: while (start < limit) {
216: if (termText[termTextStart + start] != lastTermText[start])
217: break;
218: start++;
219: }
220:
221: int length = termTextLength - start;
222:
223: output.writeVInt(start); // write shared prefix length
224: output.writeVInt(length); // write delta length
225: output.writeChars(termText, start + termTextStart, length); // write delta chars
226: output.writeVInt(fieldNumber); // write field num
227: }
228:
229: /** Called to complete TermInfos creation. */
230: void close() throws IOException {
231: output.seek(4); // write size after format
232: output.writeLong(size);
233: output.close();
234:
235: if (!isIndex)
236: other.close();
237: }
238:
239: }
|