0001: package org.apache.lucene.index;
0002:
0003: /**
0004: * Licensed to the Apache Software Foundation (ASF) under one or more
0005: * contributor license agreements. See the NOTICE file distributed with
0006: * this work for additional information regarding copyright ownership.
0007: * The ASF licenses this file to You under the Apache License, Version 2.0
0008: * (the "License"); you may not use this file except in compliance with
0009: * the License. You may obtain a copy of the License at
0010: *
0011: * http://www.apache.org/licenses/LICENSE-2.0
0012: *
0013: * Unless required by applicable law or agreed to in writing, software
0014: * distributed under the License is distributed on an "AS IS" BASIS,
0015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0016: * See the License for the specific language governing permissions and
0017: * limitations under the License.
0018: */
0019:
0020: import org.apache.lucene.document.Document;
0021: import org.apache.lucene.document.FieldSelector;
0022: import org.apache.lucene.search.Similarity;
0023: import org.apache.lucene.store.*;
0024:
0025: import java.io.File;
0026: import java.io.FileOutputStream;
0027: import java.io.IOException;
0028: import java.util.Arrays;
0029: import java.util.Collection;
0030:
0031: /** IndexReader is an abstract class, providing an interface for accessing an
0032: index. Search of an index is done entirely through this abstract interface,
0033: so that any subclass which implements it is searchable.
0034:
0035: <p> Concrete subclasses of IndexReader are usually constructed with a call to
0036: one of the static <code>open()</code> methods, e.g. {@link #open(String)}.
0037:
0038: <p> For efficiency, in this API documents are often referred to via
0039: <i>document numbers</i>, non-negative integers which each name a unique
0040: document in the index. These document numbers are ephemeral--they may change
0041: as documents are added to and deleted from an index. Clients should thus not
0042: rely on a given document having the same number between sessions.
0043:
0044: <p> An IndexReader can be opened on a directory for which an IndexWriter is
0045: opened already, but it cannot be used to delete documents from the index then.
0046:
0047: <p>
0048: NOTE: for backwards API compatibility, several methods are not listed
0049: as abstract, but have no useful implementations in this base class and
0050: instead always throw UnsupportedOperationException. Subclasses are
0051: strongly encouraged to override these methods, but in many cases may not
0052: need to.
0053: </p>
0054:
0055: @version $Id: IndexReader.java 598462 2007-11-26 23:31:39Z dnaber $
0056: */
0057: public abstract class IndexReader {
0058:
0059: /**
0060: * Constants describing field properties, for example used for
0061: * {@link IndexReader#getFieldNames(FieldOption)}.
0062: */
0063: public static final class FieldOption {
0064: private String option;
0065:
0066: private FieldOption() {
0067: }
0068:
0069: private FieldOption(String option) {
0070: this .option = option;
0071: }
0072:
0073: public String toString() {
0074: return this .option;
0075: }
0076:
0077: /** All fields */
0078: public static final FieldOption ALL = new FieldOption("ALL");
0079: /** All indexed fields */
0080: public static final FieldOption INDEXED = new FieldOption(
0081: "INDEXED");
0082: /** All fields that store payloads */
0083: public static final FieldOption STORES_PAYLOADS = new FieldOption(
0084: "STORES_PAYLOADS");
0085: /** All fields which are not indexed */
0086: public static final FieldOption UNINDEXED = new FieldOption(
0087: "UNINDEXED");
0088: /** All fields which are indexed with termvectors enabled */
0089: public static final FieldOption INDEXED_WITH_TERMVECTOR = new FieldOption(
0090: "INDEXED_WITH_TERMVECTOR");
0091: /** All fields which are indexed but don't have termvectors enabled */
0092: public static final FieldOption INDEXED_NO_TERMVECTOR = new FieldOption(
0093: "INDEXED_NO_TERMVECTOR");
0094: /** All fields with termvectors enabled. Please note that only standard termvector fields are returned */
0095: public static final FieldOption TERMVECTOR = new FieldOption(
0096: "TERMVECTOR");
0097: /** All fields with termvectors with position values enabled */
0098: public static final FieldOption TERMVECTOR_WITH_POSITION = new FieldOption(
0099: "TERMVECTOR_WITH_POSITION");
0100: /** All fields with termvectors with offset values enabled */
0101: public static final FieldOption TERMVECTOR_WITH_OFFSET = new FieldOption(
0102: "TERMVECTOR_WITH_OFFSET");
0103: /** All fields with termvectors with offset values and position values enabled */
0104: public static final FieldOption TERMVECTOR_WITH_POSITION_OFFSET = new FieldOption(
0105: "TERMVECTOR_WITH_POSITION_OFFSET");
0106: }
0107:
// NOTE(review): not read or written in the visible part of this file;
// presumably records whether close() has been called — confirm.
private boolean closed;
// Set to true by setNorm(int, String, byte) and deleteDocument(int) once
// the write lock has been acquired.
protected boolean hasChanges;

// Number of references held on this reader. Starts at 1 in the no-arg
// constructor, is changed by incRef()/decRef(), and ensureOpen() treats
// any value <= 0 as "closed".
private volatile int refCount;
0112:
0113: // for testing
0114: synchronized int getRefCount() {
0115: return refCount;
0116: }
0117:
0118: /**
0119: * Increments the refCount of this IndexReader instance. RefCounts are used to determine
0120: * when a reader can be closed safely, i. e. as soon as no other IndexReader is referencing
0121: * it anymore.
0122: */
0123: protected synchronized void incRef() {
0124: assert refCount > 0;
0125: refCount++;
0126: }
0127:
0128: /**
0129: * Decreases the refCount of this IndexReader instance. If the refCount drops
0130: * to 0, then pending changes are committed to the index and this reader is closed.
0131: *
0132: * @throws IOException in case an IOException occurs in commit() or doClose()
0133: */
0134: protected synchronized void decRef() throws IOException {
0135: assert refCount > 0;
0136: if (refCount == 1) {
0137: commit();
0138: doClose();
0139: }
0140: refCount--;
0141: }
0142:
/**
 * Directory handed to the legacy {@link #IndexReader(Directory)}
 * constructor; remains <code>null</code> when the no-arg constructor is
 * used. Returned by the default implementation of {@link #directory()}.
 *
 * @deprecated will be deleted when IndexReader(Directory) is deleted
 * @see #directory()
 */
private Directory directory;
0148:
/**
 * Legacy constructor, kept for backwards compatibility only.
 *
 * <p>
 * It supports legacy subclasses that did not "own" a specific directory
 * but needed to specify something to be returned by the directory()
 * method. New subclasses should delegate to the no-arg constructor and
 * implement the directory() method as appropriate.
 *
 * <p>
 * Delegates to the no-arg constructor first and then stores the given
 * directory.
 *
 * @param directory Directory to be returned by the directory() method
 * @see #directory()
 * @deprecated - use IndexReader()
 */
protected IndexReader(Directory directory) {
  this();
  this.directory = directory;
}
0167:
/** Creates a reader that starts out holding a single reference. */
protected IndexReader() {
  this.refCount = 1;
}
0171:
0172: /**
0173: * @throws AlreadyClosedException if this IndexReader is closed
0174: */
0175: protected final void ensureOpen() throws AlreadyClosedException {
0176: if (refCount <= 0) {
0177: throw new AlreadyClosedException(
0178: "this IndexReader is closed");
0179: }
0180: }
0181:
0182: /** Returns an IndexReader reading the index in an FSDirectory in the named
0183: path.
0184: * @throws CorruptIndexException if the index is corrupt
0185: * @throws IOException if there is a low-level IO error
0186: * @param path the path to the index directory */
0187: public static IndexReader open(String path)
0188: throws CorruptIndexException, IOException {
0189: return open(FSDirectory.getDirectory(path), true, null);
0190: }
0191:
0192: /** Returns an IndexReader reading the index in an FSDirectory in the named
0193: * path.
0194: * @param path the path to the index directory
0195: * @throws CorruptIndexException if the index is corrupt
0196: * @throws IOException if there is a low-level IO error
0197: */
0198: public static IndexReader open(File path)
0199: throws CorruptIndexException, IOException {
0200: return open(FSDirectory.getDirectory(path), true, null);
0201: }
0202:
0203: /** Returns an IndexReader reading the index in the given Directory.
0204: * @param directory the index directory
0205: * @throws CorruptIndexException if the index is corrupt
0206: * @throws IOException if there is a low-level IO error
0207: */
0208: public static IndexReader open(final Directory directory)
0209: throws CorruptIndexException, IOException {
0210: return open(directory, false, null);
0211: }
0212:
0213: /** Expert: returns an IndexReader reading the index in the given
0214: * Directory, with a custom {@link IndexDeletionPolicy}.
0215: * @param directory the index directory
0216: * @param deletionPolicy a custom deletion policy (only used
0217: * if you use this reader to perform deletes or to set
0218: * norms); see {@link IndexWriter} for details.
0219: * @throws CorruptIndexException if the index is corrupt
0220: * @throws IOException if there is a low-level IO error
0221: */
0222: public static IndexReader open(final Directory directory,
0223: IndexDeletionPolicy deletionPolicy)
0224: throws CorruptIndexException, IOException {
0225: return open(directory, false, deletionPolicy);
0226: }
0227:
0228: private static IndexReader open(final Directory directory,
0229: final boolean closeDirectory,
0230: final IndexDeletionPolicy deletionPolicy)
0231: throws CorruptIndexException, IOException {
0232: return DirectoryIndexReader.open(directory, closeDirectory,
0233: deletionPolicy);
0234: }
0235:
0236: /**
0237: * Refreshes an IndexReader if the index has changed since this instance
0238: * was (re)opened.
0239: * <p>
0240: * Opening an IndexReader is an expensive operation. This method can be used
0241: * to refresh an existing IndexReader to reduce these costs. This method
0242: * tries to only load segments that have changed or were created after the
0243: * IndexReader was (re)opened.
0244: * <p>
0245: * If the index has not changed since this instance was (re)opened, then this
0246: * call is a NOOP and returns this instance. Otherwise, a new instance is
0247: * returned. The old instance is <b>not</b> closed and remains usable.<br>
0248: * <b>Note:</b> The re-opened reader instance and the old instance might share
0249: * the same resources. For this reason no index modification operations
0250: * (e. g. {@link #deleteDocument(int)}, {@link #setNorm(int, String, byte)})
0251: * should be performed using one of the readers until the old reader instance
0252: * is closed. <b>Otherwise, the behavior of the readers is undefined.</b>
0253: * <p>
0254: * You can determine whether a reader was actually reopened by comparing the
0255: * old instance with the instance returned by this method:
0256: * <pre>
0257: * IndexReader reader = ...
0258: * ...
0259: * IndexReader new = r.reopen();
0260: * if (new != reader) {
0261: * ... // reader was reopened
0262: * reader.close();
0263: * }
0264: * reader = new;
0265: * ...
0266: * </pre>
0267: *
0268: * @throws CorruptIndexException if the index is corrupt
0269: * @throws IOException if there is a low-level IO error
0270: */
0271: public synchronized IndexReader reopen()
0272: throws CorruptIndexException, IOException {
0273: throw new UnsupportedOperationException(
0274: "This reader does not support reopen().");
0275: }
0276:
0277: /**
0278: * Returns the directory associated with this index. The Default
0279: * implementation returns the directory specified by subclasses when
0280: * delegating to the IndexReader(Directory) constructor, or throws an
0281: * UnsupportedOperationException if one was not specified.
0282: * @throws UnsupportedOperationException if no directory
0283: */
0284: public Directory directory() {
0285: ensureOpen();
0286: if (null != directory) {
0287: return directory;
0288: } else {
0289: throw new UnsupportedOperationException(
0290: "This reader does not support this method.");
0291: }
0292: }
0293:
0294: /**
0295: * Returns the time the index in the named directory was last modified.
0296: * Do not use this to check whether the reader is still up-to-date, use
0297: * {@link #isCurrent()} instead.
0298: * @throws CorruptIndexException if the index is corrupt
0299: * @throws IOException if there is a low-level IO error
0300: */
0301: public static long lastModified(String directory)
0302: throws CorruptIndexException, IOException {
0303: return lastModified(new File(directory));
0304: }
0305:
0306: /**
0307: * Returns the time the index in the named directory was last modified.
0308: * Do not use this to check whether the reader is still up-to-date, use
0309: * {@link #isCurrent()} instead.
0310: * @throws CorruptIndexException if the index is corrupt
0311: * @throws IOException if there is a low-level IO error
0312: */
0313: public static long lastModified(File fileDirectory)
0314: throws CorruptIndexException, IOException {
0315: return ((Long) new SegmentInfos.FindSegmentsFile(fileDirectory) {
0316: public Object doBody(String segmentFileName) {
0317: return new Long(FSDirectory.fileModified(fileDirectory,
0318: segmentFileName));
0319: }
0320: }.run()).longValue();
0321: }
0322:
0323: /**
0324: * Returns the time the index in the named directory was last modified.
0325: * Do not use this to check whether the reader is still up-to-date, use
0326: * {@link #isCurrent()} instead.
0327: * @throws CorruptIndexException if the index is corrupt
0328: * @throws IOException if there is a low-level IO error
0329: */
0330: public static long lastModified(final Directory directory2)
0331: throws CorruptIndexException, IOException {
0332: return ((Long) new SegmentInfos.FindSegmentsFile(directory2) {
0333: public Object doBody(String segmentFileName)
0334: throws IOException {
0335: return new Long(directory2
0336: .fileModified(segmentFileName));
0337: }
0338: }.run()).longValue();
0339: }
0340:
0341: /**
0342: * Reads version number from segments files. The version number is
0343: * initialized with a timestamp and then increased by one for each change of
0344: * the index.
0345: *
0346: * @param directory where the index resides.
0347: * @return version number.
0348: * @throws CorruptIndexException if the index is corrupt
0349: * @throws IOException if there is a low-level IO error
0350: */
0351: public static long getCurrentVersion(String directory)
0352: throws CorruptIndexException, IOException {
0353: return getCurrentVersion(new File(directory));
0354: }
0355:
0356: /**
0357: * Reads version number from segments files. The version number is
0358: * initialized with a timestamp and then increased by one for each change of
0359: * the index.
0360: *
0361: * @param directory where the index resides.
0362: * @return version number.
0363: * @throws CorruptIndexException if the index is corrupt
0364: * @throws IOException if there is a low-level IO error
0365: */
0366: public static long getCurrentVersion(File directory)
0367: throws CorruptIndexException, IOException {
0368: Directory dir = FSDirectory.getDirectory(directory);
0369: long version = getCurrentVersion(dir);
0370: dir.close();
0371: return version;
0372: }
0373:
0374: /**
0375: * Reads version number from segments files. The version number is
0376: * initialized with a timestamp and then increased by one for each change of
0377: * the index.
0378: *
0379: * @param directory where the index resides.
0380: * @return version number.
0381: * @throws CorruptIndexException if the index is corrupt
0382: * @throws IOException if there is a low-level IO error
0383: */
0384: public static long getCurrentVersion(Directory directory)
0385: throws CorruptIndexException, IOException {
0386: return SegmentInfos.readCurrentVersion(directory);
0387: }
0388:
0389: /**
0390: * Version number when this IndexReader was opened. Not implemented in the IndexReader base class.
0391: * @throws UnsupportedOperationException unless overridden in subclass
0392: */
0393: public long getVersion() {
0394: throw new UnsupportedOperationException(
0395: "This reader does not support this method.");
0396: }
0397:
0398: /**<p>For IndexReader implementations that use
0399: * TermInfosReader to read terms, this sets the
0400: * indexDivisor to subsample the number of indexed terms
0401: * loaded into memory. This has the same effect as {@link
0402: * IndexWriter#setTermIndexInterval} except that setting
0403: * must be done at indexing time while this setting can be
0404: * set per reader. When set to N, then one in every
0405: * N*termIndexInterval terms in the index is loaded into
0406: * memory. By setting this to a value > 1 you can reduce
0407: * memory usage, at the expense of higher latency when
0408: * loading a TermInfo. The default value is 1.</p>
0409: *
0410: * <b>NOTE:</b> you must call this before the term
0411: * index is loaded. If the index is already loaded,
0412: * an IllegalStateException is thrown.
0413: * @throws IllegalStateException if the term index has already been loaded into memory
0414: */
0415: public void setTermInfosIndexDivisor(int indexDivisor)
0416: throws IllegalStateException {
0417: throw new UnsupportedOperationException(
0418: "This reader does not support this method.");
0419: }
0420:
0421: /** <p>For IndexReader implementations that use
0422: * TermInfosReader to read terms, this returns the
0423: * current indexDivisor.
0424: * @see #setTermInfosIndexDivisor */
0425: public int getTermInfosIndexDivisor() {
0426: throw new UnsupportedOperationException(
0427: "This reader does not support this method.");
0428: }
0429:
0430: /**
0431: * Check whether this IndexReader is still using the
0432: * current (i.e., most recently committed) version of the
0433: * index. If a writer has committed any changes to the
0434: * index since this reader was opened, this will return
0435: * <code>false</code>, in which case you must open a new
0436: * IndexReader in order to see the changes. See the
0437: * description of the <a href="IndexWriter.html#autoCommit"><code>autoCommit</code></a>
0438: * flag which controls when the {@link IndexWriter}
0439: * actually commits changes to the index.
0440: *
0441: * <p>
0442: * Not implemented in the IndexReader base class.
0443: * </p>
0444: * @throws CorruptIndexException if the index is corrupt
0445: * @throws IOException if there is a low-level IO error
0446: * @throws UnsupportedOperationException unless overridden in subclass
0447: */
0448: public boolean isCurrent() throws CorruptIndexException,
0449: IOException {
0450: throw new UnsupportedOperationException(
0451: "This reader does not support this method.");
0452: }
0453:
0454: /**
0455: * Checks is the index is optimized (if it has a single segment and
0456: * no deletions). Not implemented in the IndexReader base class.
0457: * @return <code>true</code> if the index is optimized; <code>false</code> otherwise
0458: * @throws UnsupportedOperationException unless overridden in subclass
0459: */
0460: public boolean isOptimized() {
0461: throw new UnsupportedOperationException(
0462: "This reader does not support this method.");
0463: }
0464:
/**
 * Return an array of term frequency vectors for the specified document.
 * The array contains a vector for each vectorized field in the document.
 * Each vector contains terms and frequencies for all terms in a given vectorized field.
 * If no such fields existed, the method returns null. The term vectors that are
 * returned may either be of type TermFreqVector or of type TermPositionsVector if
 * positions or offsets have been stored.
 *
 * @param docNumber document for which term frequency vectors are returned
 * @return array of term frequency vectors. May be null if no term vectors have been
 *  stored for the specified document.
 * @throws IOException if index cannot be accessed
 * @see org.apache.lucene.document.Field.TermVector
 */
abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
    throws IOException;
0481:
/**
 * Return a term frequency vector for the specified document and field. The
 * returned vector contains terms and frequencies for the terms in
 * the specified field of this document, if the field had the storeTermVector
 * flag set. If termvectors had been stored with positions or offsets, a
 * TermPositionsVector is returned.
 *
 * @param docNumber document for which the term frequency vector is returned
 * @param field field for which the term frequency vector is returned.
 * @return term frequency vector. May be null if the field does not exist in the
 *  specified document or the term vector was not stored.
 * @throws IOException if index cannot be accessed
 * @see org.apache.lucene.document.Field.TermVector
 */
abstract public TermFreqVector getTermFreqVector(int docNumber,
    String field) throws IOException;
0498:
/**
 * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
 * the {@link TermFreqVector}.
 *
 * @param docNumber The number of the document to load the vector for
 * @param field The name of the field to load
 * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
 * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
 */
abstract public void getTermFreqVector(int docNumber, String field,
    TermVectorMapper mapper) throws IOException;
0510:
/**
 * Map all the term vectors for all fields in a Document.
 *
 * @param docNumber The number of the document to load the vector for
 * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
 * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
 */
abstract public void getTermFreqVector(int docNumber,
    TermVectorMapper mapper) throws IOException;
0519:
0520: /**
0521: * Returns <code>true</code> if an index exists at the specified directory.
0522: * If the directory does not exist or if there is no index in it.
0523: * <code>false</code> is returned.
0524: * @param directory the directory to check for an index
0525: * @return <code>true</code> if an index exists; <code>false</code> otherwise
0526: */
0527: public static boolean indexExists(String directory) {
0528: return indexExists(new File(directory));
0529: }
0530:
0531: /**
0532: * Returns <code>true</code> if an index exists at the specified directory.
0533: * If the directory does not exist or if there is no index in it.
0534: * @param directory the directory to check for an index
0535: * @return <code>true</code> if an index exists; <code>false</code> otherwise
0536: */
0537:
0538: public static boolean indexExists(File directory) {
0539: return SegmentInfos.getCurrentSegmentGeneration(directory
0540: .list()) != -1;
0541: }
0542:
0543: /**
0544: * Returns <code>true</code> if an index exists at the specified directory.
0545: * If the directory does not exist or if there is no index in it.
0546: * @param directory the directory to check for an index
0547: * @return <code>true</code> if an index exists; <code>false</code> otherwise
0548: * @throws IOException if there is a problem with accessing the index
0549: */
0550: public static boolean indexExists(Directory directory)
0551: throws IOException {
0552: return SegmentInfos.getCurrentSegmentGeneration(directory) != -1;
0553: }
0554:
/**
 * Returns the number of documents in this index.
 *
 * @return the document count
 */
public abstract int numDocs();
0557:
/**
 * Returns one greater than the largest possible document number.
 * This may be used to, e.g., determine how big to allocate an array which
 * will have an element for every document number in an index.
 *
 * @return one greater than the largest possible document number
 */
public abstract int maxDoc();
0563:
0564: /** Returns the stored fields of the <code>n</code><sup>th</sup>
0565: <code>Document</code> in this index.
0566: * @throws CorruptIndexException if the index is corrupt
0567: * @throws IOException if there is a low-level IO error
0568: */
0569: public Document document(int n) throws CorruptIndexException,
0570: IOException {
0571: ensureOpen();
0572: return document(n, null);
0573: }
0574:
/**
 * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position. The {@link org.apache.lucene.document.FieldSelector}
 * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded.
 *
 * <b>NOTE:</b> If this Reader (more specifically, the underlying <code>FieldsReader</code>) is closed before the lazy {@link org.apache.lucene.document.Field} is
 * loaded an exception may be thrown.  If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing you must
 * explicitly load it or fetch the Document again with a new loader.
 *
 * @param n Get the document at the <code>n</code><sup>th</sup> position
 * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document.  May be null, in which case all Fields will be loaded.
 * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 *
 * @see org.apache.lucene.document.Fieldable
 * @see org.apache.lucene.document.FieldSelector
 * @see org.apache.lucene.document.SetBasedFieldSelector
 * @see org.apache.lucene.document.LoadFirstFieldSelector
 */
//When we convert to JDK 1.5 make this Set<String>
// NOTE(review): no Set appears in the signature below, so the remark above
// may be stale — confirm before acting on it.
public abstract Document document(int n, FieldSelector fieldSelector)
    throws CorruptIndexException, IOException;
0598:
/**
 * Returns true if document <i>n</i> has been deleted.
 *
 * @param n the document number to check
 */
public abstract boolean isDeleted(int n);
0601:
/**
 * Returns true if any documents have been deleted.
 *
 * @return <code>true</code> if at least one document has been deleted
 */
public abstract boolean hasDeletions();
0604:
0605: /** Returns true if there are norms stored for this field. */
0606: public boolean hasNorms(String field) throws IOException {
0607: // backward compatible implementation.
0608: // SegmentReader has an efficient implementation.
0609: ensureOpen();
0610: return norms(field) != null;
0611: }
0612:
/**
 * Returns the byte-encoded normalization factor for the named field of
 * every document.  This is used by the search code to score documents.
 *
 * @param field the name of the field whose norms are requested
 * @return the byte-encoded norms; {@link #hasNorms(String)} treats a
 *         <code>null</code> result as "no norms stored"
 * @throws IOException if the norms cannot be read
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public abstract byte[] norms(String field) throws IOException;
0619:
/**
 * Reads the byte-encoded normalization factor for the named field of every
 * document.  This is used by the search code to score documents.
 *
 * @param field the name of the field whose norms are requested
 * @param bytes the array that receives the norms
 * @param offset presumably the index in <code>bytes</code> at which writing
 *        starts — TODO confirm against an implementation
 * @throws IOException if the norms cannot be read
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public abstract void norms(String field, byte[] bytes, int offset)
    throws IOException;
0627:
0628: /** Expert: Resets the normalization factor for the named field of the named
0629: * document. The norm represents the product of the field's {@link
0630: * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
0631: * int) length normalization}. Thus, to preserve the length normalization
0632: * values when resetting this, one should base the new value upon the old.
0633: *
0634: * @see #norms(String)
0635: * @see Similarity#decodeNorm(byte)
0636: * @throws StaleReaderException if the index has changed
0637: * since this reader was opened
0638: * @throws CorruptIndexException if the index is corrupt
0639: * @throws LockObtainFailedException if another writer
0640: * has this index open (<code>write.lock</code> could not
0641: * be obtained)
0642: * @throws IOException if there is a low-level IO error
0643: */
0644: public final synchronized void setNorm(int doc, String field,
0645: byte value) throws StaleReaderException,
0646: CorruptIndexException, LockObtainFailedException,
0647: IOException {
0648: ensureOpen();
0649: acquireWriteLock();
0650: hasChanges = true;
0651: doSetNorm(doc, field, value);
0652: }
0653:
/**
 * Implements setNorm in subclass. Called by
 * {@link #setNorm(int, String, byte)} after the write lock has been
 * acquired and the reader has been marked as changed.
 *
 * @param doc the document number
 * @param field the name of the field
 * @param value the new byte-encoded norm
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doSetNorm(int doc, String field, byte value)
    throws CorruptIndexException, IOException;
0657:
0658: /** Expert: Resets the normalization factor for the named field of the named
0659: * document.
0660: *
0661: * @see #norms(String)
0662: * @see Similarity#decodeNorm(byte)
0663: *
0664: * @throws StaleReaderException if the index has changed
0665: * since this reader was opened
0666: * @throws CorruptIndexException if the index is corrupt
0667: * @throws LockObtainFailedException if another writer
0668: * has this index open (<code>write.lock</code> could not
0669: * be obtained)
0670: * @throws IOException if there is a low-level IO error
0671: */
0672: public void setNorm(int doc, String field, float value)
0673: throws StaleReaderException, CorruptIndexException,
0674: LockObtainFailedException, IOException {
0675: ensureOpen();
0676: setNorm(doc, field, Similarity.encodeNorm(value));
0677: }
0678:
/**
 * Returns an enumeration of all the terms in the index. The
 * enumeration is ordered by Term.compareTo(). Each term is greater
 * than all that precede it in the enumeration. Note that after
 * calling terms(), {@link TermEnum#next()} must be called
 * on the resulting enumeration before calling other methods such as
 * {@link TermEnum#term()}.
 *
 * @return an enumeration over all terms
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms() throws IOException;
0688:
/**
 * Returns an enumeration of all terms starting at a given term. If
 * the given term does not exist, the enumeration is positioned at the
 * first term greater than the supplied term. The enumeration is
 * ordered by Term.compareTo(). Each term is greater than all that
 * precede it in the enumeration.
 *
 * @param t the term at which the enumeration starts
 * @return an enumeration positioned as described above
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms(Term t) throws IOException;
0697:
/**
 * Returns the number of documents containing the term <code>t</code>.
 *
 * @param t the term to look up
 * @return the number of documents containing <code>t</code>
 * @throws IOException if there is a low-level IO error
 */
public abstract int docFreq(Term t) throws IOException;
0702:
0703: /** Returns an enumeration of all the documents which contain
0704: * <code>term</code>. For each document, the document number, the frequency of
0705: * the term in that document is also provided, for use in search scoring.
0706: * Thus, this method implements the mapping:
0707: * <p><ul>
0708: * Term => <docNum, freq><sup>*</sup>
0709: * </ul>
0710: * <p>The enumeration is ordered by document number. Each document number
0711: * is greater than all that precede it in the enumeration.
0712: * @throws IOException if there is a low-level IO error
0713: */
0714: public TermDocs termDocs(Term term) throws IOException {
0715: ensureOpen();
0716: TermDocs termDocs = termDocs();
0717: termDocs.seek(term);
0718: return termDocs;
0719: }
0720:
/**
 * Returns an unpositioned {@link TermDocs} enumerator.
 * {@link #termDocs(Term)} positions such an enumerator by calling
 * <code>seek</code> on it.
 *
 * @throws IOException if there is a low-level IO error
 */
public abstract TermDocs termDocs() throws IOException;
0725:
0726: /** Returns an enumeration of all the documents which contain
0727: * <code>term</code>. For each document, in addition to the document number
0728: * and frequency of the term in that document, a list of all of the ordinal
0729: * positions of the term in the document is available. Thus, this method
0730: * implements the mapping:
0731: *
0732: * <p><ul>
0733: * Term => <docNum, freq,
0734: * <pos<sub>1</sub>, pos<sub>2</sub>, ...
0735: * pos<sub>freq-1</sub>>
0736: * ><sup>*</sup>
0737: * </ul>
0738: * <p> This positional information facilitates phrase and proximity searching.
0739: * <p>The enumeration is ordered by document number. Each document number is
0740: * greater than all that precede it in the enumeration.
0741: * @throws IOException if there is a low-level IO error
0742: */
0743: public TermPositions termPositions(Term term) throws IOException {
0744: ensureOpen();
0745: TermPositions termPositions = termPositions();
0746: termPositions.seek(term);
0747: return termPositions;
0748: }
0749:
/** Returns an unpositioned {@link TermPositions} enumerator.
 * <p>{@link #termPositions(Term)} obtains its enumerator from this method
 * and then positions it by calling <code>seek</code> on it.
 * @return a {@link TermPositions} enumerator that has not yet been positioned on a term
 * @throws IOException if there is a low-level IO error
 */
public abstract TermPositions termPositions() throws IOException;
0754:
/** Deletes the document numbered <code>docNum</code>. Once a document is
 * deleted it will not appear in TermDocs or TermPositions enumerations.
 * Attempts to read its field with the {@link #document}
 * method will result in an error. The presence of this document may still be
 * reflected in the {@link #docFreq} statistic, though
 * this will be corrected eventually as the index is further modified.
 *
 * <p>The actual deletion is delegated to {@link #doDelete(int)}; this method
 * first calls <code>ensureOpen()</code> and {@link #acquireWriteLock()} and
 * marks the reader as having uncommitted changes.
 *
 * @param docNum the number of the document to delete
 * @throws StaleReaderException if the index has changed
 * since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
 * @throws LockObtainFailedException if another writer
 * has this index open (<code>write.lock</code> could not
 * be obtained)
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void deleteDocument(int docNum)
    throws StaleReaderException, CorruptIndexException,
    LockObtainFailedException, IOException {
  ensureOpen();
  // The lock is requested before anything is recorded, so a failure here
  // leaves hasChanges untouched.
  acquireWriteLock();
  // Set before doDelete() runs; commit() only calls doCommit() when this flag is set.
  hasChanges = true;
  doDelete(docNum);
}
0778:
/** Implements deletion of the document numbered <code>docNum</code>.
 * Applications should call {@link #deleteDocument(int)} or {@link #deleteDocuments(Term)}.
 * <p>{@link #deleteDocument(int)} invokes this method after it has called
 * {@link #acquireWriteLock()} and flagged the reader as changed.
 * @param docNum the number of the document to delete
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doDelete(int docNum)
    throws CorruptIndexException, IOException;
0784:
0785: /** Deletes all documents that have a given <code>term</code> indexed.
0786: * This is useful if one uses a document field to hold a unique ID string for
0787: * the document. Then to delete such a document, one merely constructs a
0788: * term with the appropriate field and the unique ID string as its text and
0789: * passes it to this method.
0790: * See {@link #deleteDocument(int)} for information about when this deletion will
0791: * become effective.
0792: *
0793: * @return the number of documents deleted
0794: * @throws StaleReaderException if the index has changed
0795: * since this reader was opened
0796: * @throws CorruptIndexException if the index is corrupt
0797: * @throws LockObtainFailedException if another writer
0798: * has this index open (<code>write.lock</code> could not
0799: * be obtained)
0800: * @throws IOException if there is a low-level IO error
0801: */
0802: public final int deleteDocuments(Term term)
0803: throws StaleReaderException, CorruptIndexException,
0804: LockObtainFailedException, IOException {
0805: ensureOpen();
0806: TermDocs docs = termDocs(term);
0807: if (docs == null)
0808: return 0;
0809: int n = 0;
0810: try {
0811: while (docs.next()) {
0812: deleteDocument(docs.doc());
0813: n++;
0814: }
0815: } finally {
0816: docs.close();
0817: }
0818: return n;
0819: }
0820:
/** Undeletes all documents currently marked as deleted in this index.
 *
 * <p>The actual work is delegated to {@link #doUndeleteAll()}; this method
 * first calls <code>ensureOpen()</code> and {@link #acquireWriteLock()} and
 * marks the reader as having uncommitted changes.
 *
 * @throws StaleReaderException if the index has changed
 * since this reader was opened
 * @throws LockObtainFailedException if another writer
 * has this index open (<code>write.lock</code> could not
 * be obtained)
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void undeleteAll()
    throws StaleReaderException, CorruptIndexException,
    LockObtainFailedException, IOException {
  ensureOpen();
  // The lock is requested before anything is recorded, so a failure here
  // leaves hasChanges untouched.
  acquireWriteLock();
  // Set before doUndeleteAll() runs; commit() only calls doCommit() when this flag is set.
  hasChanges = true;
  doUndeleteAll();
}
0839:
/** Implements actual undeleteAll() in subclass.
 * <p>{@link #undeleteAll()} invokes this method after it has called
 * {@link #acquireWriteLock()} and flagged the reader as changed.
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doUndeleteAll()
    throws CorruptIndexException, IOException;
0843:
/** Does nothing by default. Subclasses that require a write lock for
 * index modifications must override this method.
 * <p>Called by {@link #deleteDocument(int)} and {@link #undeleteAll()}
 * before they record or apply any change.
 * @throws IOException never thrown by this default implementation; declared
 *         so that overriding implementations can report a failure
 */
protected synchronized void acquireWriteLock() throws IOException {
  /* Intentionally empty: the base class needs no lock. */
}
0849:
/**
 * Commits any pending changes by calling {@link #commit()}, after first
 * calling <code>ensureOpen()</code>.
 * <p>Because {@link #commit()} only invokes {@link #doCommit()} when changes
 * have been recorded, calling this method with nothing pending writes nothing.
 *
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void flush() throws IOException {
  ensureOpen();
  commit();
}
0858:
0859: /**
0860: * Commit changes resulting from delete, undeleteAll, or
0861: * setNorm operations
0862: *
0863: * If an exception is hit, then either no changes or all
0864: * changes will have been committed to the index
0865: * (transactional semantics).
0866: * @throws IOException if there is a low-level IO error
0867: */
0868: protected final synchronized void commit() throws IOException {
0869: if (hasChanges) {
0870: doCommit();
0871: }
0872: hasChanges = false;
0873: }
0874:
/** Implements commit.
 * <p>{@link #commit()} invokes this method only when changes have been
 * recorded on this reader, and clears the pending-changes flag after this
 * method returns normally.
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doCommit() throws IOException;
0877:
0878: /**
0879: * Closes files associated with this index.
0880: * Also saves any new deletions to disk.
0881: * No other methods should be called after this has been called.
0882: * @throws IOException if there is a low-level IO error
0883: */
0884: public final synchronized void close() throws IOException {
0885: if (!closed) {
0886: decRef();
0887: closed = true;
0888: }
0889: }
0890:
/** Implements close.
 * <p>NOTE(review): {@link #close()} itself only calls <code>decRef()</code>;
 * presumably <code>decRef()</code> is what ends up invoking this method --
 * confirm against its implementation.
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doClose() throws IOException;
0893:
/**
 * Gets the unique field names that exist in this index and have the
 * specified field option information.
 * @param fldOption specifies which field option should be available for the returned fields
 * @return Collection of Strings indicating the names of the fields
 *         (declared as a raw <code>Collection</code>).
 * @see IndexReader.FieldOption
 */
public abstract Collection getFieldNames(FieldOption fldOption);
0902:
/**
 * Returns <code>true</code> iff the index in the given directory is
 * currently locked.
 * <p>The check is made on the lock named
 * <code>IndexWriter.WRITE_LOCK_NAME</code> in that directory.
 * @param directory the directory to check for a lock
 * @return <code>true</code> if that lock reports itself as locked
 * @throws IOException if there is a low-level IO error
 */
public static boolean isLocked(Directory directory)
    throws IOException {
  return directory.makeLock(IndexWriter.WRITE_LOCK_NAME)
      .isLocked();
}
0914:
0915: /**
0916: * Returns <code>true</code> iff the index in the named directory is
0917: * currently locked.
0918: * @param directory the directory to check for a lock
0919: * @throws IOException if there is a low-level IO error
0920: */
0921: public static boolean isLocked(String directory) throws IOException {
0922: Directory dir = FSDirectory.getDirectory(directory);
0923: boolean result = isLocked(dir);
0924: dir.close();
0925: return result;
0926: }
0927:
/**
 * Forcibly unlocks the index in the given directory by releasing the lock
 * named <code>IndexWriter.WRITE_LOCK_NAME</code>.
 * <P>
 * Caution: this should only be used by failure recovery code,
 * when it is known that no other process nor thread is in fact
 * currently accessing this index.
 * @param directory the directory whose write lock is released
 * @throws IOException if there is a low-level IO error
 */
public static void unlock(Directory directory) throws IOException {
  directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
}
0938:
0939: /**
0940: * Prints the filename and size of each file within a given compound file.
0941: * Add the -extract flag to extract files to the current working directory.
0942: * In order to make the extracted version of the index work, you have to copy
0943: * the segments file from the compound index into the directory where the extracted files are stored.
0944: * @param args Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>
0945: */
0946: public static void main(String[] args) {
0947: String filename = null;
0948: boolean extract = false;
0949:
0950: for (int i = 0; i < args.length; ++i) {
0951: if (args[i].equals("-extract")) {
0952: extract = true;
0953: } else if (filename == null) {
0954: filename = args[i];
0955: }
0956: }
0957:
0958: if (filename == null) {
0959: System.out
0960: .println("Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>");
0961: return;
0962: }
0963:
0964: Directory dir = null;
0965: CompoundFileReader cfr = null;
0966:
0967: try {
0968: File file = new File(filename);
0969: String dirname = file.getAbsoluteFile().getParent();
0970: filename = file.getName();
0971: dir = FSDirectory.getDirectory(dirname);
0972: cfr = new CompoundFileReader(dir, filename);
0973:
0974: String[] files = cfr.list();
0975: Arrays.sort(files); // sort the array of filename so that the output is more readable
0976:
0977: for (int i = 0; i < files.length; ++i) {
0978: long len = cfr.fileLength(files[i]);
0979:
0980: if (extract) {
0981: System.out.println("extract " + files[i] + " with "
0982: + len + " bytes to local directory...");
0983: IndexInput ii = cfr.openInput(files[i]);
0984:
0985: FileOutputStream f = new FileOutputStream(files[i]);
0986:
0987: // read and write with a small buffer, which is more effectiv than reading byte by byte
0988: byte[] buffer = new byte[1024];
0989: int chunk = buffer.length;
0990: while (len > 0) {
0991: final int bufLen = (int) Math.min(chunk, len);
0992: ii.readBytes(buffer, 0, bufLen);
0993: f.write(buffer, 0, bufLen);
0994: len -= bufLen;
0995: }
0996:
0997: f.close();
0998: ii.close();
0999: } else
1000: System.out
1001: .println(files[i] + ": " + len + " bytes");
1002: }
1003: } catch (IOException ioe) {
1004: ioe.printStackTrace();
1005: } finally {
1006: try {
1007: if (dir != null)
1008: dir.close();
1009: if (cfr != null)
1010: cfr.close();
1011: } catch (IOException ioe) {
1012: ioe.printStackTrace();
1013: }
1014: }
1015: }
1016: }
|