package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

import java.io.IOException;

/**
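 * Reads the term vectors for a segment from its .tvx (index), .tvd
 * (documents) and .tvf (fields) files.
 *
 * <p>A minimal usage sketch; <code>dir</code>, <code>segment</code> and
 * <code>infos</code> are assumed to be an open {@link Directory}, a segment
 * name and that segment's {@link FieldInfos}:
 *
 * <pre>
 * TermVectorsReader reader = new TermVectorsReader(dir, segment, infos);
 * try {
 *   TermFreqVector[] vectors = reader.get(0); // all vectors for doc 0
 * } finally {
 *   reader.close();
 * }
 * </pre>
 *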
 * @version $Id: TermVectorsReader.java 601337 2007-12-05 13:59:37Z mikemccand $
 */
class TermVectorsReader implements Cloneable {

  static final int FORMAT_VERSION = 2;
  // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  static final int FORMAT_SIZE = 4;

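  // Flag bits of the per-field byte read from the .tvf file when the format
  // is FORMAT_VERSION; they record which extra data each vector stores.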
  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

  private FieldInfos fieldInfos;

  private IndexInput tvx;
  private IndexInput tvd;
  private IndexInput tvf;
  private int size;

  // The docID offset where our docs begin in the index
  // file. This will be 0 if we have our own private file.
  private int docStoreOffset;

  private int tvdFormat;
  private int tvfFormat;

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
      throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  }

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
      throws CorruptIndexException, IOException {
    // Forward the caller's readBufferSize; passing BufferedIndexInput.BUFFER_SIZE
    // here would silently ignore the requested buffer size.
    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize,
      int docStoreOffset, int size) throws CorruptIndexException, IOException {
    boolean success = false;

    try {
      if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
        tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
        checkValidFormat(tvx);
        tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
        tvdFormat = checkValidFormat(tvd);
        tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
        tvfFormat = checkValidFormat(tvf);
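        // Each document occupies one long (8 bytes) in tvx, a pointer into
        // tvd, written after the 4-byte format header; hence length/8 below
        // gives the document count.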
        if (-1 == docStoreOffset) {
          this.docStoreOffset = 0;
          this.size = (int) (tvx.length() >> 3);
        } else {
          this.docStoreOffset = docStoreOffset;
          this.size = size;
          // Verify the file is long enough to hold all of our docs
          assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
        }
      }

      this.fieldInfos = fieldInfos;
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException {
    int format = in.readInt();
    if (format > FORMAT_VERSION) {
      throw new CorruptIndexException("Incompatible format version: " + format
          + " expected " + FORMAT_VERSION + " or less");
    }
    return format;
  }

  void close() throws IOException {
    // Make all effort to close up. Keep the first exception
    // and throw it as a new one.
    IOException keep = null;
    if (tvx != null)
      try {
        tvx.close();
      } catch (IOException e) {
        if (keep == null)
          keep = e;
      }
    if (tvd != null)
      try {
        tvd.close();
      } catch (IOException e) {
        if (keep == null)
          keep = e;
      }
    if (tvf != null)
      try {
        tvf.close();
      } catch (IOException e) {
        if (keep == null)
          keep = e;
      }
    if (keep != null)
      throw (IOException) keep.fillInStackTrace();
  }

  /**
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
    if (tvx != null) {
      int fieldNumber = fieldInfos.fieldNumber(field);
      // We need to account for the FORMAT_SIZE when seeking in the tvx.
      // We don't need to do this in the other seeks because we already have
      // the file pointer that was written in another file.
      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();
      //System.out.println("Num Fields: " + fieldCount);
      // There are only a few fields per document. We opt for a full scan
      // rather than requiring that they be ordered. We need to read through
      // all of the fields anyway to get to the tvf pointers.
      int number = 0;
      int found = -1;
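      // Field numbers are written as absolute values in FORMAT_VERSION;
      // older formats delta-encode them, hence the two branches below.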
      for (int i = 0; i < fieldCount; i++) {
        if (tvdFormat == FORMAT_VERSION)
          number = tvd.readVInt();
        else
          number += tvd.readVInt();

        if (number == fieldNumber)
          found = i;
      }

      // If found is still -1, the field, although valid in the segment,
      // was not found in this document.
      if (found != -1) {
        // Compute the position in the tvf file
        position = 0;
        for (int i = 0; i <= found; i++)
          position += tvd.readVLong();

        mapper.setDocumentNumber(docNum);
        readTermVector(field, position, mapper);
      } else {
        //System.out.println("Fieldable not found");
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  /**
   * Retrieve the term vector for the given document and field
   * @param docNum The document number to retrieve the vector for
   * @param field The field within the document to retrieve
   * @return The TermFreqVector for the document and field or null if there is
   *         no term vector for this field
   * @throws IOException if there is an error reading the term vector files
   */
  TermFreqVector get(int docNum, String field) throws IOException {
    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
    get(docNum, field, mapper);

    return mapper.materializeVector();
  }

  /**
   * Return all term vectors stored for this document or null if they could not be read in.
   *
   * @param docNum The document number to retrieve the vector for
   * @return All term frequency vectors
   * @throws IOException if there is an error reading the term vector files
   */
  TermFreqVector[] get(int docNum) throws IOException {
    TermFreqVector[] result = null;
    if (tvx != null) {
      // We need to offset by FORMAT_SIZE and docStoreOffset when seeking in tvx
      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // A fieldCount of 0 means no fields were vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          if (tvdFormat == FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute the positions in the tvf file; the pointers stored in tvd
        // are delta-encoded VLongs
        position = 0;
        long[] tvfPointers = new long[fieldCount];
        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        result = readTermVectors(docNum, fields, tvfPointers);
      }
    } else {
      //System.out.println("No tvx file");
    }
    return result;
  }

  public void get(int docNumber, TermVectorMapper mapper) throws IOException {
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      // We need to offset by FORMAT_SIZE and docStoreOffset when seeking in
      // tvx, as in the other get methods; omitting docStoreOffset would read
      // the wrong document when this segment shares a doc store.
      tvx.seek(((docNumber + docStoreOffset) * 8L) + FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // A fieldCount of 0 means no fields were vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          if (tvdFormat == FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute the positions in the tvf file
        position = 0;
        long[] tvfPointers = new long[fieldCount];
        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        mapper.setDocumentNumber(docNumber);
        readTermVectors(fields, tvfPointers, mapper);
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
      throws IOException {
    SegmentTermVector res[] = new SegmentTermVector[fields.length];
    for (int i = 0; i < fields.length; i++) {
      ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
      mapper.setDocumentNumber(docNum);
      readTermVector(fields[i], tvfPointers[i], mapper);
      res[i] = (SegmentTermVector) mapper.materializeVector();
    }
    return res;
  }

  private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
      throws IOException {
    for (int i = 0; i < fields.length; i++) {
      readTermVector(fields[i], tvfPointers[i], mapper);
    }
  }

  /**
   * @param field The field to read in
   * @param tvfPointer The pointer within the tvf file where we should start reading
   * @param mapper The mapper used to map the TermVector
   * @throws IOException if there is an error reading the term vector files
   */
  private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
      throws IOException {

    // Now read the data from the specified position. We don't need to offset
    // by FORMAT_SIZE here since the pointer already includes that offset.
    tvf.seek(tvfPointer);

    int numTerms = tvf.readVInt();
    //System.out.println("Num Terms: " + numTerms);
    // If there are no terms, there is nothing to map. However, this should never occur!
    if (numTerms == 0)
      return;

    boolean storePositions;
    boolean storeOffsets;

    if (tvfFormat == FORMAT_VERSION) {
      byte bits = tvf.readByte();
      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    } else {
      // Older formats store an extra VInt here (unused) and never store
      // positions or offsets
      tvf.readVInt();
      storePositions = false;
      storeOffsets = false;
    }
    mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char[] buffer = new char[10]; // init the buffer with a length of 10 characters
    char[] previousBuffer = {};

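    // Terms are stored with prefix compression: 'start' is the number of
    // leading characters shared with the previous term and 'deltaLength' is
    // the number of new characters that follow the shared prefix.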
    for (int i = 0; i < numTerms; i++) {
      start = tvf.readVInt();
      deltaLength = tvf.readVInt();
      totalLength = start + deltaLength;
      if (buffer.length < totalLength) { // grow the buffer
        buffer = new char[totalLength];

        if (start > 0) // just copy if necessary
          System.arraycopy(previousBuffer, 0, buffer, 0, start);
      }

      tvf.readChars(buffer, start, deltaLength);
      String term = new String(buffer, 0, totalLength);
      previousBuffer = buffer;
      int freq = tvf.readVInt();
      int[] positions = null;
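      // Positions are delta-encoded: each VInt is the gap from the previous
      // position.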
      if (storePositions) { // read in the positions
        // does the mapper even care about positions?
        if (mapper.isIgnoringPositions() == false) {
          positions = new int[freq];
          int prevPosition = 0;
          for (int j = 0; j < freq; j++) {
            positions[j] = prevPosition + tvf.readVInt();
            prevPosition = positions[j];
          }
        } else {
          // We need to skip over the positions. Since these are VInts, there
          // is no way to know in advance how far to skip, so read and discard.
          for (int j = 0; j < freq; j++) {
            tvf.readVInt();
          }
        }
      }
      TermVectorOffsetInfo[] offsets = null;
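      // Offsets are also delta-encoded: each start offset is relative to the
      // previous end offset, and each end offset is relative to its start.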
      if (storeOffsets) {
        // does the mapper even care about offsets?
        if (mapper.isIgnoringOffsets() == false) {
          offsets = new TermVectorOffsetInfo[freq];
          int prevOffset = 0;
          for (int j = 0; j < freq; j++) {
            int startOffset = prevOffset + tvf.readVInt();
            int endOffset = startOffset + tvf.readVInt();
            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
            prevOffset = endOffset;
          }
        } else {
          // Skip over the two VInts (start and end offset) per occurrence
          for (int j = 0; j < freq; j++) {
            tvf.readVInt();
            tvf.readVInt();
          }
        }
      }
      mapper.map(term, freq, offsets, positions);
    }
  }

  protected Object clone() {

    if (tvx == null || tvd == null || tvf == null)
      return null;

    TermVectorsReader clone = null;
    try {
      clone = (TermVectorsReader) super.clone();
    } catch (CloneNotSupportedException e) {
      // cannot happen: this class implements Cloneable
    }

    clone.tvx = (IndexInput) tvx.clone();
    clone.tvd = (IndexInput) tvd.clone();
    clone.tvf = (IndexInput) tvf.clone();

    return clone;
  }

}

/**
 * Models the existing parallel array structure
 */
class ParallelArrayTermVectorMapper extends TermVectorMapper {

  private String[] terms;
  private int[] termFreqs;
  private int positions[][];
  private TermVectorOffsetInfo offsets[][];
  private int currentPosition;
  private boolean storingOffsets;
  private boolean storingPositions;
  private String field;

  public void setExpectations(String field, int numTerms, boolean storeOffsets,
      boolean storePositions) {
    this.field = field;
    terms = new String[numTerms];
    termFreqs = new int[numTerms];
    this.storingOffsets = storeOffsets;
    this.storingPositions = storePositions;
    if (storePositions)
      this.positions = new int[numTerms][];
    if (storeOffsets)
      this.offsets = new TermVectorOffsetInfo[numTerms][];
  }

  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
    terms[currentPosition] = term;
    termFreqs[currentPosition] = frequency;
    if (storingOffsets) {
      this.offsets[currentPosition] = offsets;
    }
    if (storingPositions) {
      this.positions[currentPosition] = positions;
    }
    currentPosition++;
  }

  /**
   * Construct the vector
   * @return The {@link TermFreqVector} based on the mappings.
   */
  public TermFreqVector materializeVector() {
    SegmentTermVector tv = null;
    if (field != null && terms != null) {
      if (storingPositions || storingOffsets) {
        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
      } else {
        tv = new SegmentTermVector(field, terms, termFreqs);
      }
    }
    return tv;
  }
}