package org.apache.lucene.benchmark.standard;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.benchmark.AbstractBenchmarker;
import org.apache.lucene.benchmark.BenchmarkOptions;
import org.apache.lucene.benchmark.Benchmarker;
import org.apache.lucene.benchmark.stats.QueryData;
import org.apache.lucene.benchmark.stats.TestData;
import org.apache.lucene.benchmark.stats.TestRunData;
import org.apache.lucene.benchmark.stats.TimeData;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;

import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Reads in the Reuters collection from workingDir/reuters-out (downloaded from
 * http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz)
 * and indexes the documents using the
 * {@link org.apache.lucene.analysis.standard.StandardAnalyzer}.
 * <p/>
 * Runs a standard set of documents through an indexer and then runs a
 * standard set of queries against the index.
 *
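 * A minimal usage sketch (hedged: a setRunCount setter matching the existing
 * getRunCount accessor is assumed on StandardOptions, and the working
 * directory name is illustrative):
 * <pre>
 * StandardOptions options = new StandardOptions();
 * options.setRunCount(1); // assumed setter
 * StandardBenchmarker bench = new StandardBenchmarker();
 * // the extracted Reuters files are expected under workingDir/reuters-out
 * TestData[] results = bench.benchmark(new File("work"), options);
 * </pre>
 *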
 * @see org.apache.lucene.benchmark.standard.StandardBenchmarker#benchmark(java.io.File, org.apache.lucene.benchmark.BenchmarkOptions)
 *
 * @deprecated use the byTask code instead. See http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/javadoc/org/apache/lucene/benchmark/byTask/package-summary.html .
 */
public class StandardBenchmarker extends AbstractBenchmarker implements Benchmarker {
  public static final String SOURCE_DIR = "reuters-out";

  public static final String INDEX_DIR = "index";

  // Reuters date format, e.g. 30-MAR-1987 14:22:36.87
  private static DateFormat format =
      new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
  //DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT);

  static {
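    // Lenient parsing tolerates minor irregularities in the Reuters date
    // strings, such as the two-digit fractional seconds in the sample above.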
    format.setLenient(true);
  }

  public StandardBenchmarker() {
  }

  public TestData[] benchmark(File workingDir, BenchmarkOptions opts) throws Exception {
    StandardOptions options = (StandardOptions) opts;
    workingDir.mkdirs();
    File sourceDir = getSourceDirectory(workingDir);

    sourceDir.mkdirs();
    File indexDir = new File(workingDir, INDEX_DIR);
    indexDir.mkdirs();
    Analyzer a = new StandardAnalyzer();
    List queryList = new ArrayList(20);
    queryList.addAll(Arrays.asList(ReutersQueries.STANDARD_QUERIES));
    queryList.addAll(Arrays.asList(ReutersQueries.getPrebuiltQueries("body")));
    Query[] qs = createQueries(queryList, a);
    // Here you can limit the set of query benchmarks
    QueryData[] qds = QueryData.getAll(qs);
    // Here you can narrow down the set of test parameters
    TestData[] params = TestData.getTestDataMinMaxMergeAndMaxBuffered(
        new File[] { sourceDir /*, jumboDir*/ },
        new Analyzer[] { a }); //TestData.getAll(new File[]{sourceDir, jumboDir}, new Analyzer[]{a});
    System.out.println("Testing " + params.length + " different permutations.");
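    // Build and benchmark a fresh index for each parameter permutation; an
    // exception in one permutation is reported but does not abort the rest.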
    for (int i = 0; i < params.length; i++) {
      try {
        reset(indexDir);
        params[i].setDirectory(FSDirectory.getDirectory(indexDir));
        params[i].setQueries(qds);
        System.out.println(params[i]);
        runBenchmark(params[i], options);
        // Here you can collect and output the runData for further processing.
        System.out.println(params[i].showRunData(params[i].getId()));
        //bench.runSearchBenchmark(queries, dir);
        params[i].getDirectory().close();
        System.runFinalization();
        System.gc();
      } catch (Exception e) {
        e.printStackTrace();
        System.out.println("EXCEPTION: " + e.getMessage());
        //break;
      }
    }
    return params;
  }

  protected File getSourceDirectory(File workingDir) {
    return new File(workingDir, SOURCE_DIR);
  }

  /**
   * Run the benchmark using the supplied parameters.
   *
   * @param params benchmark parameters
   * @param options benchmark options controlling the run count
   * @throws Exception if indexing or searching fails
   */
  protected void runBenchmark(TestData params, StandardOptions options) throws Exception {
    System.out.println("Start Time: " + new Date());
    int runCount = options.getRunCount();
    for (int i = 0; i < runCount; i++) {
      TestRunData trd = new TestRunData();
      trd.startRun();
      trd.setId(String.valueOf(i));
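      // Build a fresh index for this run with the writer settings under test.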
      IndexWriter iw = new IndexWriter(params.getDirectory(), params.getAnalyzer(), true);
      iw.setMergeFactor(params.getMergeFactor());
      iw.setMaxBufferedDocs(params.getMaxBufferedDocs());

      iw.setUseCompoundFile(params.isCompound());
      makeIndex(trd, params.getSource(), iw, true, true, false, options);
      if (params.isOptimize()) {
        TimeData td = new TimeData("optimize");
        td.start();
        iw.optimize();
        td.stop();
        trd.addData(td);
      }
      iw.close();
      QueryData[] queries = params.getQueries();
      if (queries != null) {
        IndexReader ir = null;
        IndexSearcher searcher = null;
        for (int k = 0; k < queries.length; k++) {
          QueryData qd = queries[k];
          if (ir != null && qd.reopen) {
            searcher.close();
            ir.close();
            ir = null;
            searcher = null;
          }
          if (ir == null) {
            ir = IndexReader.open(params.getDirectory());
            searcher = new IndexSearcher(ir);
          }
          Document doc = null;
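          // Optionally warm the reader by loading every non-deleted document
          // once before the timed search.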
          if (qd.warmup) {
            TimeData td = new TimeData(qd.id + "-warm");
            for (int m = 0; m < ir.maxDoc(); m++) {
              td.start();
              if (ir.isDeleted(m)) {
                td.stop();
                continue;
              }
              doc = ir.document(m);
              td.stop();
            }
            trd.addData(td);
          }
          TimeData td = new TimeData(qd.id + "-srch");
          td.start();
          Hits h = searcher.search(qd.q);
          //System.out.println("Hits Size: " + h.length() + " Query: " + qd.q);
          td.stop();
          trd.addData(td);
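          // Time traversal of the result set, optionally retrieving each
          // matching document as well.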
          td = new TimeData(qd.id + "-trav");
          if (h != null && h.length() > 0) {
            for (int m = 0; m < h.length(); m++) {
              td.start();
              int id = h.id(m);
              if (qd.retrieve) {
                doc = ir.document(id);
              }
              td.stop();
            }
          }
          trd.addData(td);
        }
        try {
          if (searcher != null) {
            searcher.close();
          }
        } catch (Exception e) {
          // ignore failures while closing the searcher
        }
        try {
          if (ir != null) {
            ir.close();
          }
        } catch (Exception e) {
          // ignore failures while closing the reader
        }
      }
      trd.endRun();
      params.getRunData().add(trd);
      //System.out.println(params[i].showRunData(params[i].getId()));
      //params.showRunData(params.getId());
    }
    System.out.println("End Time: " + new Date());
  }

  /**
   * Parse a Reuters SGML file and index its Date, Title and Body fields,
   * plus the file path and any supplied tags.
   *
   * @param in input file
   * @param tags optional tag values, stored one per "tagN" field
   * @param stored store field values
   * @param tokenized tokenize fields
   * @param tfv store term vectors
   * @return Lucene document
   * @throws Exception if the file cannot be read or its date cannot be parsed
   */
  protected Document makeDocument(File in, String[] tags, boolean stored,
      boolean tokenized, boolean tfv) throws Exception {
    Document doc = new Document();
    // translate the boolean flags into the Field constants used below
    Field.Store store = stored ? Field.Store.YES : Field.Store.NO;
    Field.Index index = tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED;
    Field.TermVector termVector = tfv ? Field.TermVector.YES : Field.TermVector.NO;
    // tag this document
    if (tags != null) {
      for (int i = 0; i < tags.length; i++) {
        doc.add(new Field("tag" + i, tags[i], store, index, termVector));
      }
    }
    doc.add(new Field("file", in.getCanonicalPath(), store, index, termVector));
    BufferedReader reader = new BufferedReader(new FileReader(in));
    String line = null;
    // first line is the date, third is the title, the rest is the body
    String dateStr = reader.readLine();
    reader.readLine(); // skip an empty line
    String title = reader.readLine();
    reader.readLine(); // skip an empty line
    StringBuffer body = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
      body.append(line).append(' ');
    }
    reader.close();

    if (dateStr != null) { // guard against empty files
      Date date = format.parse(dateStr.trim());
      doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND),
          Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    if (title != null) {
      doc.add(new Field("title", title, store, index, termVector));
    }
    if (body.length() > 0) {
      doc.add(new Field("body", body.toString(), store, index, termVector));
    }

    return doc;
  }

  /**
   * Make the index, and collect time data.
   *
   * @param trd run data to populate
   * @param srcDir directory with source files
   * @param iw index writer, already open
   * @param stored store values of fields
   * @param tokenized tokenize fields
   * @param tfv store term vectors
   * @param options benchmark options (scale-up factor, log step, document limit)
   * @throws Exception if a document cannot be read or indexed
   */
  protected void makeIndex(TestRunData trd, File srcDir, IndexWriter iw,
      boolean stored, boolean tokenized, boolean tfv, StandardOptions options)
      throws Exception {
    //File[] groups = srcDir.listFiles();
    List files = new ArrayList();
    getAllFiles(srcDir, null, files);
    Document doc = null;
    long cnt = 0L;
    TimeData td = new TimeData();
    td.name = "addDocument";
    int scaleUp = options.getScaleUp();
    int logStep = options.getLogStep();
    int max = Math.min(files.size(), options.getMaximumDocumentsToIndex());
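    // scaleUp indexes the whole collection multiple times; each pass gets its
    // own tag value so the copies remain distinguishable in the index.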
    for (int s = 0; s < scaleUp; s++) {
      String[] tags = new String[] { srcDir.getName() + "/" + s };
      int i = 0;
      for (Iterator iterator = files.iterator(); iterator.hasNext() && i < max; i++) {
        File file = (File) iterator.next();
        doc = makeDocument(file, tags, stored, tokenized, tfv);
        td.start();
        iw.addDocument(doc);
        td.stop();
        cnt++;
        if (cnt % logStep == 0) {
          System.err.println(" - processed " + cnt + ", run id=" + trd.getId());
          trd.addData(td);
          td.reset();
        }
      }
    }
    trd.addData(td);
  }

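  /**
   * Recursively collect all plain files under a directory.
   *
   * @param srcDir directory to scan
   * @param filter optional file filter, or null to accept all files
   * @param allFiles list that receives the matching files
   */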
  public static void getAllFiles(File srcDir, FileFilter filter, List allFiles) {
    File[] files = srcDir.listFiles(filter);
    if (files == null) { // srcDir is not a directory or cannot be read
      return;
    }
    for (int i = 0; i < files.length; i++) {
      File file = files[i];
      if (file.isDirectory()) {
        getAllFiles(file, filter, allFiles);
      } else {
        allFiles.add(file);
      }
    }
  }

  /**
   * Parse the supplied queries.
   *
   * @param qs list of query expressions: each entry may be either a String to
   *           be parsed against the "body" field or an already built Query
   * @param a analyzer to use when parsing query strings
   * @return array of Lucene queries
   */
  public static Query[] createQueries(List qs, Analyzer a) {
    QueryParser qp = new QueryParser("body", a);
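    // "body" is the default field for the query strings parsed below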
    List queries = new ArrayList();
    for (int i = 0; i < qs.size(); i++) {
      try {
        Object query = qs.get(i);
        Query q = null;
        if (query instanceof String) {
          q = qp.parse((String) query);
        } else if (query instanceof Query) {
          q = (Query) query;
        } else {
          System.err.println("Unsupported Query Type: " + query);
        }
        if (q != null) {
          queries.add(q);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    return (Query[]) queries.toArray(new Query[0]);
  }

  /**
   * Remove an existing index and recreate its directory.
   *
   * @param indexDir index directory to reset
   * @throws Exception if the old index cannot be deleted
   */
  protected void reset(File indexDir) throws Exception {
    if (indexDir.exists()) {
      fullyDelete(indexDir);
    }
    indexDir.mkdirs();
  }

  /**
   * Save a stream to a file.
   *
   * @param is input stream
   * @param out output file
   * @param closeInput if true, close the input stream when done
   * @throws Exception if reading or writing fails
   */
  protected void saveStream(InputStream is, File out, boolean closeInput)
      throws Exception {
    byte[] buf = new byte[4096];
    FileOutputStream fos = new FileOutputStream(out);
    try {
      int len = 0;
      long total = 0L;
      long time = System.currentTimeMillis();
      long delta = time;
      while ((len = is.read(buf)) > 0) {
        fos.write(buf, 0, len);
        total += len;
        time = System.currentTimeMillis();
        if (time - delta > 5000) { // report progress roughly every five seconds
          System.err.println(" - copied " + total / 1024 + " kB...");
          delta = time;
        }
      }
      fos.flush();
    } finally {
      fos.close();
      if (closeInput) {
        is.close();
      }
    }
  }
}
|