package org.apache.lucene.index;

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;

import org.apache.lucene.util.LuceneTestCase;

import java.util.*;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.IOException;

import junit.framework.TestCase;

public class TestStressIndexing2 extends LuceneTestCase {
  static int maxFields = 4;
  static int bigFieldSize = 10;
  static boolean sameFieldOrder = false;
  static boolean autoCommit = false;
  static int mergeFactor = 3;
  static int maxBufferedDocs = 3;
  static int seed = 0;

  static Random r = new Random(0);

  public void testRandom() throws Exception {
    Directory dir1 = new MockRAMDirectory();
    // dir1 = FSDirectory.getDirectory("foofoofoo");
    Directory dir2 = new MockRAMDirectory();
    // mergeFactor=2; maxBufferedDocs=2; Map docs = indexRandom(1, 3, 2, dir1);
    Map docs = indexRandom(10, 100, 100, dir1);
    indexSerial(docs, dir2);

    // sanity check: verify that an index compares equal to itself
    // verifyEquals(dir1, dir1, "id");
    // verifyEquals(dir2, dir2, "id");

    verifyEquals(dir1, dir2, "id");
  }

  public void testMultiConfig() throws Exception {
    // test lots of smaller different params together
    for (int i = 0; i < 100; i++) {  // increase iterations for better testing
      sameFieldOrder = r.nextBoolean();
      autoCommit = r.nextBoolean();
      mergeFactor = r.nextInt(3) + 2;
      maxBufferedDocs = r.nextInt(3) + 2;
      seed++;

      int nThreads = r.nextInt(5) + 1;
      int iter = r.nextInt(10) + 1;
      int range = r.nextInt(20) + 1;

      Directory dir1 = new MockRAMDirectory();
      Directory dir2 = new MockRAMDirectory();
      Map docs = indexRandom(nThreads, iter, range, dir1);
      indexSerial(docs, dir2);
      verifyEquals(dir1, dir2, "id");
    }
  }

  static Term idTerm = new Term("id", "");
  IndexingThread[] threads;
  static Comparator fieldNameComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
      return ((Fieldable) o1).name().compareTo(((Fieldable) o2).name());
    }
  };

  // This test avoids using any extra synchronization in the multiple indexing
  // threads, so that it verifies IndexWriter itself synchronizes everything
  // correctly.

  public Map indexRandom(int nThreads, int iterations, int range, Directory dir)
      throws IOException, InterruptedException {
    IndexWriter w = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true);
    w.setUseCompoundFile(false);
    /***
    w.setMaxMergeDocs(Integer.MAX_VALUE);
    w.setMaxFieldLength(10000);
    w.setRAMBufferSizeMB(1);
    w.setMergeFactor(10);
    ***/

    // force many merges
    w.setMergeFactor(mergeFactor);
    w.setRAMBufferSizeMB(.1);
    w.setMaxBufferedDocs(maxBufferedDocs);

    threads = new IndexingThread[nThreads];
    for (int i = 0; i < threads.length; i++) {
      IndexingThread th = new IndexingThread();
      th.w = w;
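      // give each thread its own disjoint id space (base + nextInt(range)), so
      // documents written by different threads never share an id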
      th.base = 1000000 * i;
      th.range = range;
      th.iterations = iterations;
      threads[i] = th;
    }

    for (int i = 0; i < threads.length; i++) {
      threads[i].start();
    }
    for (int i = 0; i < threads.length; i++) {
      threads[i].join();
    }

    // w.optimize();
    w.close();

    Map docs = new HashMap();
    for (int i = 0; i < threads.length; i++) {
      IndexingThread th = threads[i];
      synchronized (th) {
        docs.putAll(th.docs);
      }
    }

    return docs;
  }

  public static void indexSerial(Map docs, Directory dir) throws IOException {
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer());

    // index all docs in a single thread
    Iterator iter = docs.values().iterator();
    while (iter.hasNext()) {
      Document d = (Document) iter.next();
      ArrayList fields = new ArrayList();
      fields.addAll(d.getFields());
      // put fields in same order each time
      Collections.sort(fields, fieldNameComparator);

      Document d1 = new Document();
      d1.setBoost(d.getBoost());
      for (int i = 0; i < fields.size(); i++) {
        d1.add((Fieldable) fields.get(i));
      }
      w.addDocument(d1);
      // System.out.println("indexing "+d1);
    }

    w.close();
  }

  public static void verifyEquals(Directory dir1, Directory dir2, String idField) throws IOException {
    IndexReader r1 = IndexReader.open(dir1);
    IndexReader r2 = IndexReader.open(dir2);
    verifyEquals(r1, r2, idField);
    r1.close();
    r2.close();
  }

  public static void verifyEquals(IndexReader r1, IndexReader r2, String idField) throws IOException {
    assertEquals(r1.numDocs(), r2.numDocs());
    boolean hasDeletes = !(r1.maxDoc() == r2.maxDoc() && r1.numDocs() == r1.maxDoc());

    int[] r2r1 = new int[r2.maxDoc()]; // r2 id to r1 id mapping

    TermDocs termDocs1 = r1.termDocs();
    TermDocs termDocs2 = r2.termDocs();

    // create mapping from id2 space to id1 space based on idField
    idField = idField.intern();
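    // Term field names are interned, so the identity comparison against idField
    // in the loop below is a valid field-equality check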
    TermEnum termEnum = r1.terms(new Term(idField, ""));
    do {
      Term term = termEnum.term();
      if (term == null || term.field() != idField)
        break;

      termDocs1.seek(termEnum);
      assertTrue(termDocs1.next());
      int id1 = termDocs1.doc();
      assertFalse(termDocs1.next());

      termDocs2.seek(termEnum);
      assertTrue(termDocs2.next());
      int id2 = termDocs2.doc();
      assertFalse(termDocs2.next());

      r2r1[id2] = id1;

      // verify stored fields are equivalent
      verifyEquals(r1.document(id1), r2.document(id2));

      try {
        // verify term vectors are equivalent
        verifyEquals(r1.getTermFreqVectors(id1), r2.getTermFreqVectors(id2));
      } catch (java.lang.Error e) {
        System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
        TermFreqVector[] tv1 = r1.getTermFreqVectors(id1);
        System.out.println(" d1=" + tv1);
        if (tv1 != null)
          for (int i = 0; i < tv1.length; i++)
            System.out.println(" " + i + ": " + tv1[i]);

        TermFreqVector[] tv2 = r2.getTermFreqVectors(id2);
        System.out.println(" d2=" + tv2);
        if (tv2 != null)
          for (int i = 0; i < tv2.length; i++)
            System.out.println(" " + i + ": " + tv2[i]);

        throw e;
      }

    } while (termEnum.next());

    termEnum.close();

    // Verify postings
    TermEnum termEnum1 = r1.terms(new Term("", ""));
    TermEnum termEnum2 = r2.terms(new Term("", ""));

    // pack both doc and freq into single element for easy sorting
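    // (docid goes in the upper 32 bits and freq in the lower 32, so numeric
    // order on the packed longs is ascending docid order)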
    long[] info1 = new long[r1.numDocs()];
    long[] info2 = new long[r2.numDocs()];

    for (;;) {
      Term term1, term2;

      // iterate until we get some docs
      int len1;
      for (;;) {
        len1 = 0;
        term1 = termEnum1.term();
        if (term1 == null) break;
        termDocs1.seek(termEnum1);
        while (termDocs1.next()) {
          int d1 = termDocs1.doc();
          int f1 = termDocs1.freq();
          info1[len1] = (((long) d1) << 32) | f1;
          len1++;
        }
        if (len1 > 0) break;
        if (!termEnum1.next()) break;
      }

      // iterate until we get some docs
      int len2;
      for (;;) {
        len2 = 0;
        term2 = termEnum2.term();
        if (term2 == null) break;
        termDocs2.seek(termEnum2);
        while (termDocs2.next()) {
          int d2 = termDocs2.doc();
          int f2 = termDocs2.freq();
          info2[len2] = (((long) r2r1[d2]) << 32) | f2;
          len2++;
        }
        if (len2 > 0) break;
        if (!termEnum2.next()) break;
      }

      // docFreq() still counts deleted documents, so it can legitimately differ
      // between the two indexes when deletes are present
      if (!hasDeletes)
        assertEquals(termEnum1.docFreq(), termEnum2.docFreq());

      assertEquals(len1, len2);
      if (len1 == 0) break; // no more terms

      assertEquals(term1, term2);

      // sort info2 into ascending docid order (info1 is already ascending; the
      // r2r1 remapping scrambles the order of info2)
      Arrays.sort(info2, 0, len2);

      // now compare
      for (int i = 0; i < len1; i++) {
        assertEquals(info1[i], info2[i]);
      }

      termEnum1.next();
      termEnum2.next();
    }
  }

  public static void verifyEquals(Document d1, Document d2) {
    List ff1 = d1.getFields();
    List ff2 = d2.getFields();

    Collections.sort(ff1, fieldNameComparator);
    Collections.sort(ff2, fieldNameComparator);

    if (ff1.size() != ff2.size()) {
      System.out.println(ff1);
      System.out.println(ff2);
      assertEquals(ff1.size(), ff2.size());
    }

    for (int i = 0; i < ff1.size(); i++) {
      Fieldable f1 = (Fieldable) ff1.get(i);
      Fieldable f2 = (Fieldable) ff2.get(i);
      if (f1.isBinary()) {
        assertTrue(f2.isBinary());
        // TODO
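        // A minimal sketch of the missing binary comparison, assuming
        // Fieldable.binaryValue() returns the stored bytes:
        //   assertTrue(Arrays.equals(f1.binaryValue(), f2.binaryValue()));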
      } else {
        String s1 = f1.stringValue();
        String s2 = f2.stringValue();
        if (!s1.equals(s2)) {
          // print out whole doc on error
          System.out.println(ff1);
          System.out.println(ff2);
          assertEquals(s1, s2);
        }
      }
    }
  }

  public static void verifyEquals(TermFreqVector[] d1, TermFreqVector[] d2) {
    if (d1 == null) {
      assertTrue(d2 == null);
      return;
    }
    assertTrue(d2 != null);

    assertEquals(d1.length, d2.length);
    for (int i = 0; i < d1.length; i++) {
      TermFreqVector v1 = d1[i];
      TermFreqVector v2 = d2[i];
      assertEquals(v1.size(), v2.size());
      int numTerms = v1.size();
      String[] terms1 = v1.getTerms();
      String[] terms2 = v2.getTerms();
      int[] freq1 = v1.getTermFrequencies();
      int[] freq2 = v2.getTermFrequencies();
      for (int j = 0; j < numTerms; j++) {
        assertEquals(terms1[j], terms2[j]);
        assertEquals(freq1[j], freq2[j]);
      }
      if (v1 instanceof TermPositionVector) {
        assertTrue(v2 instanceof TermPositionVector);
        TermPositionVector tpv1 = (TermPositionVector) v1;
        TermPositionVector tpv2 = (TermPositionVector) v2;
        for (int j = 0; j < numTerms; j++) {
          int[] pos1 = tpv1.getTermPositions(j);
          int[] pos2 = tpv2.getTermPositions(j);
          assertEquals(pos1.length, pos2.length);
          TermVectorOffsetInfo[] offsets1 = tpv1.getOffsets(j);
          TermVectorOffsetInfo[] offsets2 = tpv2.getOffsets(j);
          if (offsets1 == null)
            assertTrue(offsets2 == null);
          else
            assertTrue(offsets2 != null);
          for (int k = 0; k < pos1.length; k++) {
            assertEquals(pos1[k], pos2[k]);
            if (offsets1 != null) {
              assertEquals(offsets1[k].getStartOffset(), offsets2[k].getStartOffset());
              assertEquals(offsets1[k].getEndOffset(), offsets2[k].getEndOffset());
            }
          }
        }
      }
    }
  }

  private static class IndexingThread extends Thread {
    IndexWriter w;
    int base;
    int range;
    int iterations;
    Map docs = new HashMap(); // Map<String,Document>
    Random r;

    public int nextInt(int lim) {
      return r.nextInt(lim);
    }

    public String getString(int nTokens) {
      nTokens = nTokens != 0 ? nTokens : r.nextInt(4) + 1;
      // avoid StringBuffer because it adds extra synchronization.
      char[] arr = new char[nTokens * 2];
      for (int i = 0; i < nTokens; i++) {
        arr[i * 2] = (char) ('A' + r.nextInt(10));
        arr[i * 2 + 1] = ' ';
      }
      return new String(arr);
    }

    public void indexDoc() throws IOException {
      Document d = new Document();

      ArrayList fields = new ArrayList();
      int id = base + nextInt(range);
      String idString = "" + id;
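      // store the id untokenized and without norms so that verifyEquals() can
      // look each document up again by its exact id term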
      Field idField = new Field("id", idString, Field.Store.YES, Field.Index.NO_NORMS);
      fields.add(idField);

      int nFields = nextInt(maxFields);
      for (int i = 0; i < nFields; i++) {

        Field.TermVector tvVal = Field.TermVector.NO;
        switch (nextInt(4)) {
          case 0:
            tvVal = Field.TermVector.NO;
            break;
          case 1:
            tvVal = Field.TermVector.YES;
            break;
          case 2:
            tvVal = Field.TermVector.WITH_POSITIONS;
            break;
          case 3:
            tvVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
            break;
        }

        switch (nextInt(4)) {
          case 0:
            fields.add(new Field("f0", getString(1), Field.Store.YES, Field.Index.NO_NORMS, tvVal));
            break;
          case 1:
            fields.add(new Field("f1", getString(0), Field.Store.NO, Field.Index.TOKENIZED, tvVal));
            break;
          case 2:
            fields.add(new Field("f2", getString(0), Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
            break;
          case 3:
            fields.add(new Field("f3", getString(bigFieldSize), Field.Store.YES, Field.Index.TOKENIZED, tvVal));
            break;
        }
      }

      if (sameFieldOrder) {
        Collections.sort(fields, fieldNameComparator);
      } else {
        // random placement of id field also
        Collections.swap(fields, nextInt(fields.size()), 0);
      }

      for (int i = 0; i < fields.size(); i++) {
        d.add((Fieldable) fields.get(i));
      }
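      // updateDocument() deletes any previous document with the same id term and
      // then adds the new one, matching the last-write-wins semantics of the docs map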
      w.updateDocument(idTerm.createTerm(idString), d);
      // System.out.println("indexing "+d);
      docs.put(idString, d);
    }

    public void run() {
      try {
        r = new Random(base + range + seed);
        for (int i = 0; i < iterations; i++) {
          indexDoc();
        }
      } catch (Exception e) {
        e.printStackTrace();
        TestCase.fail(e.toString());
      }

      // this otherwise empty-looking synchronized block pairs with the
      // synchronized (th) read in indexRandom(), so the unsynchronized docs map
      // is safely published to the thread that collects the results
      synchronized (this) {
        docs.size();
      }
    }
  }

}