001: package org.apache.lucene.index.memory;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.BufferedReader;
021: import java.io.ByteArrayInputStream;
022: import java.io.File;
023: import java.io.FileInputStream;
024: import java.io.FilenameFilter;
025: import java.io.IOException;
026: import java.io.InputStream;
027: import java.io.InputStreamReader;
028: import java.nio.ByteBuffer;
029: import java.nio.charset.Charset;
030: import java.util.ArrayList;
031: import java.util.Enumeration;
032: import java.util.LinkedHashSet;
033: import java.util.Set;
034:
035: import junit.framework.TestCase;
036:
037: import org.apache.lucene.analysis.Analyzer;
038: import org.apache.lucene.analysis.SimpleAnalyzer;
039: import org.apache.lucene.analysis.StopAnalyzer;
040: import org.apache.lucene.analysis.StopFilter;
041: import org.apache.lucene.analysis.standard.StandardAnalyzer;
042: import org.apache.lucene.document.Document;
043: import org.apache.lucene.document.Field;
044: import org.apache.lucene.index.IndexWriter;
045: import org.apache.lucene.queryParser.ParseException;
046: import org.apache.lucene.queryParser.QueryParser;
047: import org.apache.lucene.search.HitCollector;
048: import org.apache.lucene.search.IndexSearcher;
049: import org.apache.lucene.search.Query;
050: import org.apache.lucene.search.Searcher;
051: import org.apache.lucene.store.Directory;
052: import org.apache.lucene.store.RAMDirectory;
053:
054: /**
055: Verifies that Lucene MemoryIndex and RAMDirectory have the same behaviour,
056: returning the same results for any given query.
057: Runs a set of queries against a set of files and compares results for identity.
058: Can also be used as a simple benchmark.
059: <p>
060: Example usage:
061: <pre>
062: cd lucene-svn
063: java -server -cp ~/unix/java/share/misc/junit/junit.jar:build/classes:build/lucene-core-2.1-dev.jar:build/contrib/memory/classes/test:build/contrib/memory/classes/java org.apache.lucene.index.memory.MemoryIndexTest 1 1 memram @contrib/memory/src/test/org/apache/lucene/index/memory/testqueries.txt *.txt *.html *.xml xdocs/*.xml src/test/org/apache/lucene/queryParser/*.java contrib/memory/src/java/org/apache/lucene/index/memory/*.java
064: </pre>
065: where testqueries.txt is a file with one query per line, such as:
066: <pre>
067: #
068: # queries extracted from TestQueryParser.java
069: #
070: Apache
071: Apach~ AND Copy*
072:
073: a AND b
074: (a AND b)
075: c OR (a AND b)
076: a AND NOT b
077: a AND -b
078: a AND !b
079: a && b
080: a && ! b
081:
082: a OR b
083: a || b
084: a OR !b
085: a OR ! b
086: a OR -b
087:
088: +term -term term
089: foo:term AND field:anotherTerm
090: term AND "phrase phrase"
091: "hello there"
092:
093: germ term^2.0
094: (term)^2.0
095: (germ term)^2.0
096: term^2.0
097: term^2
098: "germ term"^2.0
099: "term germ"^2
100:
101: (foo OR bar) AND (baz OR boo)
102: ((a OR b) AND NOT c) OR d
103: +(apple "steve jobs") -(foo bar baz)
104: +title:(dog OR cat) -author:"bob dole"
105:
106:
107: a&b
108: a&&b
109: .NET
110:
111: "term germ"~2
112: "term germ"~2 flork
113: "term"~2
114: "~2 germ"
115: "term germ"~2^2
116:
117: 3
118: term 1.0 1 2
119: term term1 term2
120:
121: term*
122: term*^2
123: term~
124: term~0.7
125: term~^2
126: term^2~
127: term*germ
128: term*germ^3
129:
130:
131: term*
132: Term*
133: TERM*
134: term*
135: Term*
136: TERM*
137:
138: // Then 'full' wildcard queries:
139: te?m
140: Te?m
141: TE?M
142: Te?m*gerM
143: te?m
144: Te?m
145: TE?M
146: Te?m*gerM
147:
148: term term term
149: term +stop term
150: term -stop term
151: drop AND stop AND roll
152: term phrase term
153: term AND NOT phrase term
154: stop
155:
156:
157: [ a TO c]
158: [ a TO c ]
159: { a TO c}
160: { a TO c }
161: { a TO c }^2.0
162: [ a TO c] OR bar
163: [ a TO c] AND bar
164: ( bar blar { a TO c})
165: gack ( bar blar { a TO c})
166:
167:
168: +weltbank +worlbank
169: +weltbank\n+worlbank
170: weltbank \n+worlbank
171: weltbank \n +worlbank
172: +weltbank\r+worlbank
173: weltbank \r+worlbank
174: weltbank \r +worlbank
175: +weltbank\r\n+worlbank
176: weltbank \r\n+worlbank
177: weltbank \r\n +worlbank
178: weltbank \r \n +worlbank
179: +weltbank\t+worlbank
180: weltbank \t+worlbank
181: weltbank \t +worlbank
182:
183:
184: term term term
185: term +term term
186: term term +term
187: term +term +term
188: -term term term
189:
190:
191: on^1.0
192: "hello"^2.0
193: hello^2.0
194: "on"^1.0
195: the^3
196: </pre>
197:
198: @author whoschek.AT.lbl.DOT.gov
199: */
200: public class MemoryIndexTest extends TestCase {
201:
202: private Analyzer analyzer;
203: private boolean fastMode = false;
204:
205: private final boolean verbose = false;
206:
207: private static final String FIELD_NAME = "content";
208:
209: /** Runs the tests and/or benchmark */
210: public static void main(String[] args) throws Throwable {
211: new MemoryIndexTest().run(args);
212: }
213:
214: /* all files will be opened relative to this directory */
215: public String fileDir;
216:
217: public void setUp() {
218: fileDir = System.getProperty("lucene.common.dir", null);
219: }
220:
221: // public void tearDown() {}
222:
223: public void testMany() throws Throwable {
224: String[] files = listFiles(new String[] {
225: "*.txt",
226: "*.html",
227: "*.xml",
228: "xdocs/*.xml",
229: "src/java/test/org/apache/lucene/queryParser/*.java",
230: "contrib/memory/src/java/org/apache/lucene/index/memory/*.java", });
231: System.out.println("files = " + java.util.Arrays.asList(files));
232: String[] xargs = new String[] {
233: "1",
234: "1",
235: "memram",
236: "@contrib/memory/src/test/org/apache/lucene/index/memory/testqueries.txt", };
237: String[] args = new String[xargs.length + files.length];
238: System.arraycopy(xargs, 0, args, 0, xargs.length);
239: System.arraycopy(files, 0, args, xargs.length, files.length);
240: run(args);
241: }
242:
/**
 * Core driver: parses positional arguments, then for every
 * (analyzer, file, query) combination builds a MemoryIndex and/or a
 * RAMDirectory index over the file's text and asserts that both return
 * identical scores. Also prints simple throughput numbers per iteration.
 *
 * Positional args (all optional):
 *   [0] iters  - number of outer benchmark iterations (clamped to >= 1)
 *   [1] runs   - times each query is executed per file (clamped to >= 1)
 *   [2] cmd    - contains "mem" and/or "ram" to select which index types run
 *   [3] query  - a literal query, or "@file" naming a file with one query per line
 *   [4..] file names to index (non-existent files / directories are skipped)
 *
 * @throws Throwable rethrows any failure after printing the offending
 *         query/file/analyzer combination for diagnosis
 */
private void run(String[] args) throws Throwable {
    int k = -1; // cursor into the positional argument list; ++k consumes one arg

    int iters = 1;
    if (args.length > ++k)
        iters = Math.max(1, Integer.parseInt(args[k]));

    int runs = 1;
    if (args.length > ++k)
        runs = Math.max(1, Integer.parseInt(args[k]));

    // cmd selects index types by substring, so "memram" enables both
    String cmd = "memram";
    if (args.length > ++k)
        cmd = args[k];
    boolean useMemIndex = cmd.indexOf("mem") >= 0;
    boolean useRAMIndex = cmd.indexOf("ram") >= 0;

    // default query set used when no query argument is given
    String[] queries = { "term", "term*", "term~", "Apache",
            "Apach~ AND Copy*" };
    if (args.length > ++k) {
        String arg = args[k];
        if (arg.startsWith("@"))
            // "@path" means: load one query per line from that file
            queries = readLines(new File(fileDir, arg.substring(1)));
        else
            queries = new String[] { arg };
    }

    // default corpus when no file names are given
    File[] files = new File[] { new File("CHANGES.txt"),
            new File("LICENSE.txt") };
    if (args.length > ++k) {
        // all remaining args are file names
        files = new File[args.length - k];
        for (int i = k; i < args.length; i++) {
            files[i - k] = new File(args[i]);
        }
    }

    // NOTE(review): toLowerCase and stopWords are computed but never used
    // below — presumably leftovers for the commented-out PatternAnalyzer
    // variants; confirm before removing.
    boolean toLowerCase = true;
    // boolean toLowerCase = false;
    // Set stopWords = null;
    Set stopWords = StopFilter
            .makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);

    // each analyzer is cross-checked independently
    Analyzer[] analyzers = new Analyzer[] { new SimpleAnalyzer(),
            new StopAnalyzer(), new StandardAnalyzer(),
            PatternAnalyzer.DEFAULT_ANALYZER,
            // new WhitespaceAnalyzer(),
            // new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
            // new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
            // new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
            };

    for (int iter = 0; iter < iters; iter++) {
        System.out.println("\n########### iteration=" + iter);
        long start = System.currentTimeMillis();
        long bytes = 0; // total input bytes processed, for the MB/sec figure

        for (int anal = 0; anal < analyzers.length; anal++) {
            this.analyzer = analyzers[anal]; // used by the create*/parseQuery helpers

            for (int i = 0; i < files.length; i++) {
                File file = files[i];
                if (!file.exists() || file.isDirectory())
                    continue; // ignore
                bytes += file.length();
                // read the whole file with the platform default charset
                String text = toString(new FileInputStream(file),
                        null);
                Document doc = createDocument(text);
                if (verbose)
                    System.out
                            .println("\n*********** FILE=" + file);

                for (int q = 0; q < queries.length; q++) {
                    try {
                        Query query = parseQuery(queries[q]);

                        boolean measureIndexing = false; // toggle this to measure query performance
                        // when measuring queries, index once up front;
                        // when measuring indexing, rebuild inside the runs loop
                        MemoryIndex memind = null;
                        if (useMemIndex && !measureIndexing)
                            memind = createMemoryIndex(doc);
                        RAMDirectory ramind = null;
                        if (useRAMIndex && !measureIndexing)
                            ramind = createRAMIndex(doc);

                        for (int run = 0; run < runs; run++) {
                            float score1 = 0.0f;
                            float score2 = 0.0f;
                            if (useMemIndex && measureIndexing)
                                memind = createMemoryIndex(doc);
                            if (useMemIndex)
                                score1 = query(memind, query);
                            if (useRAMIndex && measureIndexing)
                                ramind = createRAMIndex(doc);
                            if (useRAMIndex)
                                score2 = query(ramind, query);
                            if (useMemIndex && useRAMIndex) {
                                if (verbose)
                                    System.out.println("diff="
                                            + (score1 - score2)
                                            + ", query="
                                            + queries[q] + ", s1="
                                            + score1 + ", s2="
                                            + score2);
                                // identical scores required; both must also be
                                // sane (within [0.0, 1.0])
                                if (score1 != score2
                                        || score1 < 0.0f
                                        || score2 < 0.0f
                                        || score1 > 1.0f
                                        || score2 > 1.0f) {
                                    throw new IllegalStateException(
                                            "BUG DETECTED:"
                                                    + (i * (q + 1))
                                                    + " at query="
                                                    + queries[q]
                                                    + ", file="
                                                    + file
                                                    + ", anal="
                                                    + analyzer);
                                }
                            }
                        }

                    } catch (Throwable t) {
                        // print context for the failing combination, then rethrow
                        if (t instanceof OutOfMemoryError)
                            t.printStackTrace();
                        System.out.println("Fatal error at query="
                                + queries[q] + ", file=" + file
                                + ", anal=" + analyzer);
                        throw t;
                    }
                }
            }
        }
        // per-iteration throughput summary
        long end = System.currentTimeMillis();
        System.out.println("\nsecs = " + ((end - start) / 1000.0f));
        System.out
                .println("queries/sec= "
                        + (1.0f * runs * queries.length
                                * analyzers.length * files.length / ((end - start) / 1000.0f)));
        float mb = (1.0f * bytes * queries.length * runs)
                / (1024.0f * 1024.0f);
        System.out.println("MB/sec = "
                + (mb / ((end - start) / 1000.0f)));
    }

    if (useMemIndex && useRAMIndex)
        System.out.println("No bug found. done.");
    else
        System.out
                .println("Done benchmarking (without checking correctness).");
}
392:
393: // returns file line by line, ignoring empty lines and comments
394: private String[] readLines(File file) throws Exception {
395: BufferedReader reader = new BufferedReader(
396: new InputStreamReader(new FileInputStream(file)));
397: ArrayList lines = new ArrayList();
398: String line;
399: while ((line = reader.readLine()) != null) {
400: String t = line.trim();
401: if (t.length() > 0 && t.charAt(0) != '#'
402: && (!t.startsWith("//"))) {
403: lines.add(line);
404: }
405: }
406: reader.close();
407:
408: String[] result = new String[lines.size()];
409: lines.toArray(result);
410: return result;
411: }
412:
413: private Document createDocument(String content) {
414: Document doc = new Document();
415: doc
416: .add(new Field(FIELD_NAME, content, Field.Store.NO,
417: Field.Index.TOKENIZED,
418: Field.TermVector.WITH_POSITIONS));
419: return doc;
420: }
421:
422: private MemoryIndex createMemoryIndex(Document doc) {
423: MemoryIndex index = new MemoryIndex();
424: Enumeration iter = doc.fields();
425: while (iter.hasMoreElements()) {
426: Field field = (Field) iter.nextElement();
427: index.addField(field.name(), field.stringValue(), analyzer);
428: }
429: return index;
430: }
431:
432: private RAMDirectory createRAMIndex(Document doc) {
433: RAMDirectory dir = new RAMDirectory();
434: IndexWriter writer = null;
435: try {
436: writer = new IndexWriter(dir, analyzer, true);
437: writer.setMaxFieldLength(Integer.MAX_VALUE);
438: writer.addDocument(doc);
439: writer.optimize();
440: return dir;
441: } catch (IOException e) { // should never happen (RAMDirectory)
442: throw new RuntimeException(e);
443: } finally {
444: try {
445: if (writer != null)
446: writer.close();
447: } catch (IOException e) { // should never happen (RAMDirectory)
448: throw new RuntimeException(e);
449: }
450: }
451: }
452:
453: private float query(Object index, Query query) {
454: // System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
455: Searcher searcher = null;
456: try {
457: if (index instanceof Directory)
458: searcher = new IndexSearcher((Directory) index);
459: else
460: searcher = ((MemoryIndex) index).createSearcher();
461:
462: final float[] scores = new float[1]; // inits to 0.0f
463: searcher.search(query, new HitCollector() {
464: public void collect(int doc, float score) {
465: scores[0] = score;
466: }
467: });
468: float score = scores[0];
469: // Hits hits = searcher.search(query);
470: // float score = hits.length() > 0 ? hits.score(0) : 0.0f;
471: return score;
472: } catch (IOException e) { // should never happen (RAMDirectory)
473: throw new RuntimeException(e);
474: } finally {
475: try {
476: if (searcher != null)
477: searcher.close();
478: } catch (IOException e) { // should never happen (RAMDirectory)
479: throw new RuntimeException(e);
480: }
481: }
482: }
483:
484: private int getMemorySize(Object index) {
485: if (index instanceof Directory) {
486: try {
487: Directory dir = (Directory) index;
488: int size = 0;
489: String[] fileNames = dir.list();
490: for (int i = 0; i < fileNames.length; i++) {
491: size += dir.fileLength(fileNames[i]);
492: }
493: return size;
494: } catch (IOException e) { // can never happen (RAMDirectory)
495: throw new RuntimeException(e);
496: }
497: } else {
498: return ((MemoryIndex) index).getMemorySize();
499: }
500: }
501:
502: private Query parseQuery(String expression) throws ParseException {
503: QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
504: // parser.setPhraseSlop(0);
505: return parser.parse(expression);
506: }
507:
508: /** returns all files matching the given file name patterns (quick n'dirty) */
509: static String[] listFiles(String[] fileNames) {
510: LinkedHashSet allFiles = new LinkedHashSet();
511: for (int i = 0; i < fileNames.length; i++) {
512: int k;
513: if ((k = fileNames[i].indexOf("*")) < 0) {
514: allFiles.add(fileNames[i]);
515: } else {
516: String prefix = fileNames[i].substring(0, k);
517: if (prefix.length() == 0)
518: prefix = ".";
519: final String suffix = fileNames[i].substring(k + 1);
520: File[] files = new File(prefix)
521: .listFiles(new FilenameFilter() {
522: public boolean accept(File dir, String name) {
523: return name.endsWith(suffix);
524: }
525: });
526: if (files != null) {
527: for (int j = 0; j < files.length; j++) {
528: allFiles.add(files[j].getPath());
529: }
530: }
531: }
532: }
533:
534: String[] result = new String[allFiles.size()];
535: allFiles.toArray(result);
536: return result;
537: }
538:
// Trick to detect the default platform charset: an InputStreamReader built
// without an explicit charset picks the platform default, which getEncoding()
// then reports. (Charset.defaultCharset() would do the same but requires
// JDK 1.5+, and this file sticks to pre-1.5 idioms.)
private static final Charset DEFAULT_PLATFORM_CHARSET = Charset
        .forName(new InputStreamReader(new ByteArrayInputStream(
                new byte[0])).getEncoding());
543:
544: // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
545: private static String toString(InputStream input, Charset charset)
546: throws IOException {
547: if (charset == null)
548: charset = DEFAULT_PLATFORM_CHARSET;
549: byte[] data = toByteArray(input);
550: return charset.decode(ByteBuffer.wrap(data)).toString();
551: }
552:
/**
 * Reads the entire stream into a byte array and closes the stream, even on
 * failure. Grows the result by at least doubling; after each growth the
 * previous (larger) result array is recycled as the read buffer so future
 * bulk reads get bigger.
 */
private static byte[] toByteArray(InputStream input)
        throws IOException {
    try {
        // initial capacity: trust available() as a hint but never start
        // below 256 bytes
        // safe and fast even if input.available() behaves weird or buggy
        int len = Math.max(256, input.available());
        byte[] buffer = new byte[len];
        byte[] output = new byte[len];

        len = 0; // from here on: len = number of valid bytes in output
        int n;
        while ((n = input.read(buffer)) >= 0) {
            if (len + n > output.length) { // grow capacity
                // at least double, but always enough for the pending chunk
                byte tmp[] = new byte[Math.max(output.length << 1,
                        len + n)];
                System.arraycopy(output, 0, tmp, 0, len);
                System.arraycopy(buffer, 0, tmp, len, n);
                buffer = output; // use larger buffer for future larger bulk reads
                output = tmp;
            } else {
                System.arraycopy(buffer, 0, output, len, n);
            }
            len += n;
        }

        if (len == output.length)
            return output; // exact fit: no trimming copy needed
        buffer = null; // help gc
        buffer = new byte[len]; // trim to the exact length read
        System.arraycopy(output, 0, buffer, 0, len);
        return buffer;
    } finally {
        if (input != null)
            input.close();
    }
}
588:
589: }
|