package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

/**
 * Class responsible for access to stored document fields.
 * <p/>
 * It uses the &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
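 * <p>
 * Minimal usage sketch (illustrative only; assumes a {@link Directory} dir
 * containing a segment "_0" whose stored fields were written by
 * {@link FieldsWriter}, and a {@link FieldInfos} loaded for that segment):
 * <pre>
 *   FieldsReader fieldsReader = new FieldsReader(dir, "_0", fieldInfos);
 *   Document doc = fieldsReader.doc(0, null); // a null selector loads all stored fields
 *   fieldsReader.close();
 * </pre>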
 *
 * @version $Id: FieldsReader.java 620759 2008-02-12 11:10:21Z mikemccand $
 */
final class FieldsReader {
  private final FieldInfos fieldInfos;

  // The main fieldStream, used only for cloning.
  private final IndexInput cloneableFieldsStream;

  // This is a clone of cloneableFieldsStream used for reading documents.
  // It should not be cloned outside of a synchronized context.
  private final IndexInput fieldsStream;

  private final IndexInput indexStream;
  private int numTotalDocs;
  private int size;
  private boolean closed;

  // The docID offset where our docs begin in the index
  // file. This will be 0 if we have our own private file.
  private int docStoreOffset;

  // Per-thread clone of cloneableFieldsStream, created on demand so that
  // lazy fields can be read later without touching the shared fieldsStream.
  private ThreadLocal fieldsStreamTL = new ThreadLocal();

  FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize)
      throws IOException {
    this(d, segment, fn, readBufferSize, -1, 0);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize,
      int docStoreOffset, int size) throws IOException {
    boolean success = false;

    try {
      fieldInfos = fn;

      cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
      fieldsStream = (IndexInput) cloneableFieldsStream.clone();
      indexStream = d.openInput(segment + ".fdx", readBufferSize);

      if (docStoreOffset != -1) {
        // We read only a slice out of this shared fields file
        this.docStoreOffset = docStoreOffset;
        this.size = size;

        // Verify the file is long enough to hold all of our
        // docs
        assert ((int) (indexStream.length() / 8)) >= size + this.docStoreOffset;
      } else {
        this.docStoreOffset = 0;
        // The index file holds one 8-byte pointer per document
        this.size = (int) (indexStream.length() >> 3);
      }

      numTotalDocs = (int) (indexStream.length() >> 3);
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /**
   * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams,
   * including any associated with a lazy implementation of a Field. After
   * closing, the values of those fields are no longer accessible.
   *
   * @throws IOException if closing one of the streams fails
   */
  final void close() throws IOException {
    if (!closed) {
      if (fieldsStream != null) {
        fieldsStream.close();
      }
      if (cloneableFieldsStream != null) {
        cloneableFieldsStream.close();
      }
      if (indexStream != null) {
        indexStream.close();
      }
      IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get();
      if (localFieldsStream != null) {
        localFieldsStream.close();
        fieldsStreamTL.set(null);
      }
      closed = true;
    }
  }

  final int size() {
    return size;
  }

  final Document doc(int n, FieldSelector fieldSelector)
      throws CorruptIndexException, IOException {
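    // Each index entry is a single 8-byte pointer into the .fdt file; with a
    // shared doc store the entry for this reader's doc n lives at
    // (n + docStoreOffset) * 8 (e.g. docStoreOffset = 10 maps doc 0 to entry 10).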
    indexStream.seek((n + docStoreOffset) * 8L);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      FieldSelectorResult acceptField = fieldSelector == null
          ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);

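      // The status byte is a small bit set: FIELD_IS_BINARY, FIELD_IS_COMPRESSED
      // and FIELD_IS_TOKENIZED may be OR'ed together, e.g. a compressed,
      // tokenized text field has those two bits set and the binary bit clear.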
      byte bits = fieldsStream.readByte();
      assert bits <= FieldsWriter.FIELD_IS_COMPRESSED
          + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;

      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
      // TODO: Find an alternative approach here if this list continues to grow
      // beyond the 5 or 6 cases currently here. See Lucene 762 for discussion.
      if (acceptField.equals(FieldSelectorResult.LOAD)) {
        addField(doc, fi, binary, compressed, tokenize);
      } else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) {
        addFieldForMerge(doc, fi, binary, compressed, tokenize);
      } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)) {
        addField(doc, fi, binary, compressed, tokenize);
        break; // Get out of this loop
      } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
        addFieldLazy(doc, fi, binary, compressed, tokenize);
      } else if (acceptField.equals(FieldSelectorResult.SIZE)) {
        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
      } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)) {
        addFieldSize(doc, fi, binary, compressed);
        break;
      } else {
        skipField(binary, compressed);
      }
    }

    return doc;
  }

  /** Fills lengths with the length in bytes of each raw document in a
   *  contiguous range of numDocs documents starting with startDocID.
   *  Returns the IndexInput (the fieldsStream), already positioned at the
   *  starting point for startDocID. */
  final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs)
      throws IOException {
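    // .fdx layout (illustrative): entry i is one long at byte offset i*8 holding
    // the .fdt pointer for document i, so the length of doc i is
    // pointer(i + 1) - pointer(i); the last document is bounded by the length
    // of the .fdt file itself.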
    indexStream.seek((docStoreOffset + startDocID) * 8L);
    long startOffset = indexStream.readLong();
    long lastOffset = startOffset;
    int count = 0;
    while (count < numDocs) {
      final long offset;
      final int docID = docStoreOffset + startDocID + count + 1;
      assert docID <= numTotalDocs;
      if (docID < numTotalDocs)
        offset = indexStream.readLong();
      else
        offset = fieldsStream.length();
      lengths[count++] = (int) (offset - lastOffset);
      lastOffset = offset;
    }

    fieldsStream.seek(startOffset);

    return fieldsStream;
  }

  /**
   * Skip the field. We still have to read some of the information about the
   * field, but can skip past the actual content. This will have the most
   * payoff on large fields.
   */
  private void skipField(boolean binary, boolean compressed) throws IOException {
    skipField(binary, compressed, fieldsStream.readVInt());
  }

  private void skipField(boolean binary, boolean compressed, int toRead)
      throws IOException {
    if (binary || compressed) {
      long pointer = fieldsStream.getFilePointer();
      fieldsStream.seek(pointer + toRead);
    } else {
      // We need to skip chars one by one, since they are variable-width; this
      // will slow us down, but is still better than reading the whole string.
      fieldsStream.skipChars(toRead);
    }
  }

  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary,
      boolean compressed, boolean tokenize) throws IOException {
    if (binary) {
      int toRead = fieldsStream.readVInt();
      long pointer = fieldsStream.getFilePointer();
      if (compressed) {
        //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
        doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer));
      } else {
        //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
        doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer));
      }
      // Need to move the pointer ahead by toRead positions
      fieldsStream.seek(pointer + toRead);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = getIndexType(fi, tokenize);
      Field.TermVector termVector = getTermVectorType(fi);

      Fieldable f;
      if (compressed) {
        store = Field.Store.COMPRESS;
        int toRead = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        f = new LazyField(fi.name, store, toRead, pointer);
        // skip over the part that we aren't loading
        fieldsStream.seek(pointer + toRead);
        f.setOmitNorms(fi.omitNorms);
      } else {
        int length = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        // Skip ahead of where we are by the length of what is stored
        fieldsStream.skipChars(length);
        f = new LazyField(fi.name, store, index, termVector, length, pointer);
        f.setOmitNorms(fi.omitNorms);
      }
      doc.add(f);
    }
  }

  // in merge mode we don't uncompress the data of a compressed field
  private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary,
      boolean compressed, boolean tokenize) throws IOException {
    Object data;

    if (binary || compressed) {
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      data = b;
    } else {
      data = fieldsStream.readString();
    }

    doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
  }

  private void addField(Document doc, FieldInfo fi, boolean binary,
      boolean compressed, boolean tokenize)
      throws CorruptIndexException, IOException {

    // we have a binary stored field, and it may be compressed
    if (binary) {
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      if (compressed)
        doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
      else
        doc.add(new Field(fi.name, b, Field.Store.YES));

    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = getIndexType(fi, tokenize);
      Field.TermVector termVector = getTermVectorType(fi);

      Fieldable f;
      if (compressed) {
        store = Field.Store.COMPRESS;
        int toRead = fieldsStream.readVInt();

        final byte[] b = new byte[toRead];
        fieldsStream.readBytes(b, 0, b.length);
        f = new Field(fi.name,                    // field name
            new String(uncompress(b), "UTF-8"),   // uncompress the value and add as string
            store, index, termVector);
        f.setOmitNorms(fi.omitNorms);
      } else {
        f = new Field(fi.name,                    // name
            fieldsStream.readString(),            // read value
            store, index, termVector);
        f.setOmitNorms(fi.omitNorms);
      }
      doc.add(f);
    }
  }

  // Add the size of the field as a byte[] containing the 4 bytes of the integer
  // byte size (high order byte first; char = 2 bytes).
  // Read just the size -- the caller must skip the field content to continue
  // reading fields.
  // Return the size in bytes or chars, depending on the field type.
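  // Example (illustrative): a plain text field stored as 3 chars yields
  // size = 3 and bytesize = 6, so the added field holds {0x00, 0x00, 0x00, 0x06}.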
  private int addFieldSize(Document doc, FieldInfo fi, boolean binary,
      boolean compressed) throws IOException {
    int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2 * size;
    byte[] sizebytes = new byte[4];
    sizebytes[0] = (byte) (bytesize >>> 24);
    sizebytes[1] = (byte) (bytesize >>> 16);
    sizebytes[2] = (byte) (bytesize >>> 8);
    sizebytes[3] = (byte) bytesize;
    doc.add(new Field(fi.name, sizebytes, Field.Store.YES));
    return size;
  }

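  // Mapping from FieldInfo term-vector flags to Field.TermVector (illustrative):
  //   storeTermVector = false          -> NO
  //   no positions, no offsets         -> YES
  //   offsets only                     -> WITH_OFFSETS
  //   positions only                   -> WITH_POSITIONS
  //   positions and offsets            -> WITH_POSITIONS_OFFSETS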
  private Field.TermVector getTermVectorType(FieldInfo fi) {
    Field.TermVector termVector = null;
    if (fi.storeTermVector) {
      if (fi.storeOffsetWithTermVector) {
        if (fi.storePositionWithTermVector) {
          termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
        } else {
          termVector = Field.TermVector.WITH_OFFSETS;
        }
      } else if (fi.storePositionWithTermVector) {
        termVector = Field.TermVector.WITH_POSITIONS;
      } else {
        termVector = Field.TermVector.YES;
      }
    } else {
      termVector = Field.TermVector.NO;
    }
    return termVector;
  }

  private Field.Index getIndexType(FieldInfo fi, boolean tokenize) {
    Field.Index index;
    if (fi.isIndexed && tokenize)
      index = Field.Index.TOKENIZED;
    else if (fi.isIndexed && !tokenize)
      index = Field.Index.UN_TOKENIZED;
    else
      index = Field.Index.NO;
    return index;
  }

  /**
   * A lazy implementation of Fieldable that defers loading of fields until
   * asked for, instead of when the Document is loaded.
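   * <p>
   * Illustrative sketch (assumes a FieldSelector that returns
   * {@link FieldSelectorResult#LAZY_LOAD} for the field "body"):
   * <pre>
   *   Document doc = fieldsReader.doc(n, lazySelector); // no field bytes read yet
   *   String body = doc.getFieldable("body").stringValue(); // reads from disk here
   * </pre>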
   */
  private class LazyField extends AbstractField implements Fieldable {
    private int toRead;
    private long pointer;

    public LazyField(String name, Field.Store store, int toRead, long pointer) {
      super(name, store, Field.Index.NO, Field.TermVector.NO);
      this.toRead = toRead;
      this.pointer = pointer;
      lazy = true;
    }

    public LazyField(String name, Field.Store store, Field.Index index,
        Field.TermVector termVector, int toRead, long pointer) {
      super(name, store, index, termVector);
      this.toRead = toRead;
      this.pointer = pointer;
      lazy = true;
    }

    // Returns this thread's private clone of the fields stream, creating it on
    // first use so lazy loads never share an IndexInput across threads.
    private IndexInput getFieldStream() {
      IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get();
      if (localFieldsStream == null) {
        localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
        fieldsStreamTL.set(localFieldsStream);
      }
      return localFieldsStream;
    }

    /** The value of the field in Binary, or null. If null, the Reader value,
     * String value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public byte[] binaryValue() {
      ensureOpen();
      if (fieldsData == null) {
        final byte[] b = new byte[toRead];
        IndexInput localFieldsStream = getFieldStream();
        // Throw this IOException since IndexReader.document does so anyway, so
        // it is probably not that big of a change for people since they are
        // already handling this exception when getting the document.
        try {
          localFieldsStream.seek(pointer);
          localFieldsStream.readBytes(b, 0, b.length);
          if (isCompressed) {
            fieldsData = uncompress(b);
          } else {
            fieldsData = b;
          }
        } catch (IOException e) {
          throw new FieldReaderException(e);
        }
      }
      return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
    }

    /** The value of the field as a Reader, or null. If null, the String value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public Reader readerValue() {
      ensureOpen();
      return fieldsData instanceof Reader ? (Reader) fieldsData : null;
    }

    /** The value of the field as a TokenStream, or null. If null, the Reader value,
     * String value, or binary value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
    }

    /** The value of the field as a String, or null. If null, the Reader value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (fieldsData == null) {
        IndexInput localFieldsStream = getFieldStream();
        try {
          localFieldsStream.seek(pointer);
          if (isCompressed) {
            final byte[] b = new byte[toRead];
            localFieldsStream.readBytes(b, 0, b.length);
            fieldsData = new String(uncompress(b), "UTF-8");
          } else {
            // read in chars because we already know the length we need to read
            char[] chars = new char[toRead];
            localFieldsStream.readChars(chars, 0, toRead);
            fieldsData = new String(chars);
          }
        } catch (IOException e) {
          throw new FieldReaderException(e);
        }
      }
      return fieldsData instanceof String ? (String) fieldsData : null;
    }

    public long getPointer() {
      ensureOpen();
      return pointer;
    }

    public void setPointer(long pointer) {
      ensureOpen();
      this.pointer = pointer;
    }

    public int getToRead() {
      ensureOpen();
      return toRead;
    }

    public void setToRead(int toRead) {
      ensureOpen();
      this.toRead = toRead;
    }
  }

  private final byte[] uncompress(final byte[] input)
      throws CorruptIndexException, IOException {

    Inflater decompressor = new Inflater();
    decompressor.setInput(input);

    // Create an expandable byte array to hold the decompressed data
    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);

    // Decompress the data
    byte[] buf = new byte[1024];
    while (!decompressor.finished()) {
      try {
        int count = decompressor.inflate(buf);
        bos.write(buf, 0, count);
      } catch (DataFormatException e) {
        // this will happen if the field is not compressed
        CorruptIndexException newException = new CorruptIndexException(
            "field data are in wrong format: " + e.toString());
        newException.initCause(e);
        throw newException;
      }
    }

    decompressor.end();

    // Get the decompressed data
    return bos.toByteArray();
  }

  // Instances of this class hold field properties and data
  // for merge
  static final class FieldForMerge extends AbstractField {
    public String stringValue() {
      return (String) this.fieldsData;
    }

    public Reader readerValue() {
      // not needed for merge
      return null;
    }

    public byte[] binaryValue() {
      return (byte[]) this.fieldsData;
    }

    public TokenStream tokenStreamValue() {
      // not needed for merge
      return null;
    }

    public FieldForMerge(Object value, FieldInfo fi, boolean binary,
        boolean compressed, boolean tokenize) {
      this.isStored = true;
      this.fieldsData = value;
      this.isCompressed = compressed;
      this.isBinary = binary;
      this.isTokenized = tokenize;

      this.name = fi.name.intern();
      this.isIndexed = fi.isIndexed;
      this.omitNorms = fi.omitNorms;
      this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
      this.storePositionWithTermVector = fi.storePositionWithTermVector;
      this.storeTermVector = fi.storeTermVector;
    }
  }
}