001: package org.apache.lucene.index;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.store.FSDirectory;
021: import org.apache.lucene.store.Directory;
022: import org.apache.lucene.store.IndexInput;
023: import org.apache.lucene.document.Document;
024:
025: import java.text.NumberFormat;
026: import java.io.PrintStream;
027: import java.io.IOException;
028: import java.util.Collection;
029: import java.util.Iterator;
030:
031: /**
032: * Basic tool to check the health of an index and write a
033: * new segments file that removes reference to problematic
034: * segments. There are many more checks that this tool
035: * could do but does not yet, eg: reconstructing a segments
036: * file by looking for all loadable segments (if no segments
037: * file is found), removing specifically specified segments,
038: * listing files that exist but are not referenced, etc.
039: */
040:
041: public class CheckIndex {
042:
043: public static PrintStream out = System.out;
044:
045: private static class MySegmentTermDocs extends SegmentTermDocs {
046:
047: int delCount;
048:
049: MySegmentTermDocs(SegmentReader p) {
050: super (p);
051: }
052:
053: public void seek(Term term) throws IOException {
054: super .seek(term);
055: delCount = 0;
056: }
057:
058: protected void skippingDoc() throws IOException {
059: delCount++;
060: }
061: }
062:
063: /** Returns true if index is clean, else false.*/
064: public static boolean check(Directory dir, boolean doFix)
065: throws IOException {
066: NumberFormat nf = NumberFormat.getInstance();
067: SegmentInfos sis = new SegmentInfos();
068:
069: try {
070: sis.read(dir);
071: } catch (Throwable t) {
072: out
073: .println("ERROR: could not read any segments file in directory");
074: t.printStackTrace(out);
075: return false;
076: }
077:
078: final int numSegments = sis.size();
079: final String segmentsFileName = sis.getCurrentSegmentFileName();
080: IndexInput input = null;
081: try {
082: input = dir.openInput(segmentsFileName);
083: } catch (Throwable t) {
084: out
085: .println("ERROR: could not open segments file in directory");
086: t.printStackTrace(out);
087: return false;
088: }
089: int format = 0;
090: try {
091: format = input.readInt();
092: } catch (Throwable t) {
093: out
094: .println("ERROR: could not read segment file version in directory");
095: t.printStackTrace(out);
096: return false;
097: } finally {
098: if (input != null)
099: input.close();
100: }
101:
102: String sFormat = "";
103: boolean skip = false;
104:
105: if (format == SegmentInfos.FORMAT)
106: sFormat = "FORMAT [Lucene Pre-2.1]";
107: if (format == SegmentInfos.FORMAT_LOCKLESS)
108: sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
109: else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
110: sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
111: else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
112: sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
113: else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE) {
114: sFormat = "int=" + format
115: + " [newer version of Lucene than this tool]";
116: skip = true;
117: } else {
118: sFormat = format + " [Lucene 1.3 or prior]";
119: }
120:
121: out
122: .println("Segments file=" + segmentsFileName
123: + " numSegments=" + numSegments + " version="
124: + sFormat);
125:
126: if (skip) {
127: out
128: .println("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
129: return false;
130: }
131:
132: SegmentInfos newSIS = (SegmentInfos) sis.clone();
133: newSIS.clear();
134: boolean changed = false;
135: int totLoseDocCount = 0;
136: int numBadSegments = 0;
137: for (int i = 0; i < numSegments; i++) {
138: final SegmentInfo info = sis.info(i);
139: out.println(" " + (1 + i) + " of " + numSegments
140: + ": name=" + info.name + " docCount="
141: + info.docCount);
142: int toLoseDocCount = info.docCount;
143:
144: SegmentReader reader = null;
145:
146: try {
147: out
148: .println(" compound="
149: + info.getUseCompoundFile());
150: out.println(" numFiles=" + info.files().size());
151: out.println(" size (MB)="
152: + nf.format(info.sizeInBytes()
153: / (1024. * 1024.)));
154: final int docStoreOffset = info.getDocStoreOffset();
155: if (docStoreOffset != -1) {
156: out.println(" docStoreOffset=" + docStoreOffset);
157: out.println(" docStoreSegment="
158: + info.getDocStoreSegment());
159: out.println(" docStoreIsCompoundFile="
160: + info.getDocStoreIsCompoundFile());
161: }
162: final String delFileName = info.getDelFileName();
163: if (delFileName == null)
164: out.println(" no deletions");
165: else
166: out.println(" has deletions [delFileName="
167: + delFileName + "]");
168: out.print(" test: open reader.........");
169: reader = SegmentReader.get(info);
170: final int numDocs = reader.numDocs();
171: toLoseDocCount = numDocs;
172: if (reader.hasDeletions())
173: out.println("OK [" + (info.docCount - numDocs)
174: + " deleted docs]");
175: else
176: out.println("OK");
177:
178: out.print(" test: fields, norms.......");
179: Collection fieldNames = reader
180: .getFieldNames(IndexReader.FieldOption.ALL);
181: Iterator it = fieldNames.iterator();
182: while (it.hasNext()) {
183: final String fieldName = (String) it.next();
184: byte[] b = reader.norms(fieldName);
185: if (b.length != info.docCount)
186: throw new RuntimeException("norms for field \""
187: + fieldName + "\" is length "
188: + b.length + " != maxDoc "
189: + info.docCount);
190:
191: }
192: out.println("OK [" + fieldNames.size() + " fields]");
193:
194: out.print(" test: terms, freq, prox...");
195: final TermEnum termEnum = reader.terms();
196: final TermPositions termPositions = reader
197: .termPositions();
198:
199: // Used only to count up # deleted docs for this
200: // term
201: final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(
202: reader);
203:
204: long termCount = 0;
205: long totFreq = 0;
206: long totPos = 0;
207: while (termEnum.next()) {
208: termCount++;
209: final Term term = termEnum.term();
210: final int docFreq = termEnum.docFreq();
211: termPositions.seek(term);
212: int lastDoc = -1;
213: int freq0 = 0;
214: totFreq += docFreq;
215: while (termPositions.next()) {
216: freq0++;
217: final int doc = termPositions.doc();
218: final int freq = termPositions.freq();
219: if (doc <= lastDoc)
220: throw new RuntimeException("term " + term
221: + ": doc " + doc + " < lastDoc "
222: + lastDoc);
223: lastDoc = doc;
224: if (freq <= 0)
225: throw new RuntimeException("term " + term
226: + ": doc " + doc + ": freq " + freq
227: + " is out of bounds");
228:
229: int lastPos = -1;
230: totPos += freq;
231: for (int j = 0; j < freq; j++) {
232: final int pos = termPositions
233: .nextPosition();
234: if (pos < 0)
235: throw new RuntimeException("term "
236: + term + ": doc " + doc
237: + ": pos " + pos
238: + " is out of bounds");
239: if (pos <= lastPos)
240: throw new RuntimeException("term "
241: + term + ": doc " + doc
242: + ": pos " + pos
243: + " < lastPos " + lastPos);
244: }
245: }
246:
247: // Now count how many deleted docs occurred in
248: // this term:
249: final int delCount;
250: if (reader.hasDeletions()) {
251: myTermDocs.seek(term);
252: while (myTermDocs.next()) {
253: }
254: delCount = myTermDocs.delCount;
255: } else
256: delCount = 0;
257:
258: if (freq0 + delCount != docFreq)
259: throw new RuntimeException("term " + term
260: + " docFreq=" + docFreq
261: + " != num docs seen " + freq0
262: + " + num docs deleted " + delCount);
263: }
264:
265: out.println("OK [" + termCount + " terms; " + totFreq
266: + " terms/docs pairs; " + totPos + " tokens]");
267:
268: out.print(" test: stored fields.......");
269: int docCount = 0;
270: long totFields = 0;
271: for (int j = 0; j < info.docCount; j++)
272: if (!reader.isDeleted(j)) {
273: docCount++;
274: Document doc = reader.document(j);
275: totFields += doc.getFields().size();
276: }
277:
278: if (docCount != reader.numDocs())
279: throw new RuntimeException("docCount=" + docCount
280: + " but saw " + docCount
281: + " undeleted docs");
282:
283: out.println("OK [" + totFields
284: + " total field count; avg "
285: + nf.format((((float) totFields) / docCount))
286: + " fields per doc]");
287:
288: out.print(" test: term vectors........");
289: int totVectors = 0;
290: for (int j = 0; j < info.docCount; j++)
291: if (!reader.isDeleted(j)) {
292: TermFreqVector[] tfv = reader
293: .getTermFreqVectors(j);
294: if (tfv != null)
295: totVectors += tfv.length;
296: }
297:
298: out.println("OK [" + totVectors
299: + " total vector count; avg "
300: + nf.format((((float) totVectors) / docCount))
301: + " term/freq vector fields per doc]");
302: out.println("");
303:
304: } catch (Throwable t) {
305: out.println("FAILED");
306: String comment;
307: if (doFix)
308: comment = "will remove reference to this segment (-fix is specified)";
309: else
310: comment = "would remove reference to this segment (-fix was not specified)";
311: out.println(" WARNING: " + comment
312: + "; full exception:");
313: t.printStackTrace(out);
314: out.println("");
315: totLoseDocCount += toLoseDocCount;
316: numBadSegments++;
317: changed = true;
318: continue;
319: } finally {
320: if (reader != null)
321: reader.close();
322: }
323:
324: // Keeper
325: newSIS.add(info.clone());
326: }
327:
328: if (!changed) {
329: out.println("No problems were detected with this index.\n");
330: return true;
331: } else {
332: out.println("WARNING: " + numBadSegments
333: + " broken segments detected");
334: if (doFix)
335: out.println("WARNING: " + totLoseDocCount
336: + " documents will be lost");
337: else
338: out
339: .println("WARNING: "
340: + totLoseDocCount
341: + " documents would be lost if -fix were specified");
342: out.println();
343: }
344:
345: if (doFix) {
346: out
347: .println("NOTE: will write new segments file in 5 seconds; this will remove "
348: + totLoseDocCount
349: + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
350: for (int i = 0; i < 5; i++) {
351: try {
352: Thread.sleep(1000);
353: } catch (InterruptedException ie) {
354: Thread.currentThread().interrupt();
355: i--;
356: continue;
357: }
358:
359: out.println(" " + (5 - i) + "...");
360: }
361: out.print("Writing...");
362: try {
363: newSIS.write(dir);
364: } catch (Throwable t) {
365: out.println("FAILED; exiting");
366: t.printStackTrace(out);
367: return false;
368: }
369: out.println("OK");
370: out.println("Wrote new segments file \""
371: + newSIS.getCurrentSegmentFileName() + "\"");
372: } else {
373: out
374: .println("NOTE: would write new segments file [-fix was not specified]");
375: }
376: out.println("");
377:
378: return false;
379: }
380:
381: public static void main(String[] args) throws Throwable {
382:
383: boolean doFix = false;
384: for (int i = 0; i < args.length; i++)
385: if (args[i].equals("-fix")) {
386: doFix = true;
387: break;
388: }
389:
390: if (args.length != (doFix ? 2 : 1)) {
391: out
392: .println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix]\n"
393: + "\n"
394: + " -fix: actually write a new segments_N file, removing any problematic segments\n"
395: + "\n"
396: + "**WARNING**: -fix should only be used on an emergency basis as it will cause\n"
397: + "documents (perhaps many) to be permanently removed from the index. Always make\n"
398: + "a backup copy of your index before running this! Do not run this tool on an index\n"
399: + "that is actively being written to. You have been warned!\n"
400: + "\n"
401: + "Run without -fix, this tool will open the index, report version information\n"
402: + "and report any exceptions it hits and what action it would take if -fix were\n"
403: + "specified. With -fix, this tool will remove any segments that have issues and\n"
404: + "write a new segments_N file. This means all documents contained in the affected\n"
405: + "segments will be removed.\n"
406: + "\n"
407: + "This tool exits with exit code 1 if the index cannot be opened or has has any\n"
408: + "corruption, else 0.\n");
409: System.exit(1);
410: }
411:
412: final String dirName = args[0];
413: out.println("\nOpening index @ " + dirName + "\n");
414: Directory dir = null;
415: try {
416: dir = FSDirectory.getDirectory(dirName);
417: } catch (Throwable t) {
418: out.println("ERROR: could not open directory \"" + dirName
419: + "\"; exiting");
420: t.printStackTrace(out);
421: System.exit(1);
422: }
423:
424: boolean isClean = check(dir, doFix);
425:
426: final int exitCode;
427: if (isClean)
428: exitCode = 0;
429: else
430: exitCode = 1;
431: System.exit(exitCode);
432: }
433: }
|