001: package org.apache.lucene.benchmark.byTask.feeds;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.benchmark.byTask.utils.Config;
021:
022: import java.io.BufferedReader;
023: import java.io.File;
024: import java.io.FileReader;
025: import java.text.DateFormat;
026: import java.text.SimpleDateFormat;
027: import java.util.ArrayList;
028: import java.util.Date;
029: import java.util.Locale;
030:
031: /**
032: * A DocMaker using the Reuters collection for its input.
033: * <p>
034: * Config properties:<ul>
035: * <li>work.dir=<path to the root of docs and indexes dirs| Default: work></li>
036: * <li>docs.dir=<path to the docs dir| Default: reuters-out></li>
037: * </ul>
038: */
039: public class ReutersDocMaker extends BasicDocMaker {
040:
041: private ThreadLocal dateFormat = new ThreadLocal();
042: private File dataDir = null;
043: private ArrayList inputFiles = new ArrayList();
044: private int nextFile = 0;
045: private int iteration = 0;
046:
047: /* (non-Javadoc)
048: * @see SimpleDocMaker#setConfig(java.util.Properties)
049: */
050: public void setConfig(Config config) {
051: super .setConfig(config);
052: File workDir = new File(config.get("work.dir", "work"));
053: String d = config.get("docs.dir", "reuters-out");
054: dataDir = new File(d);
055: if (!dataDir.isAbsolute()) {
056: dataDir = new File(workDir, d);
057: }
058:
059: collectFiles(dataDir, inputFiles);
060: if (inputFiles.size() == 0) {
061: throw new RuntimeException("No txt files in dataDir: "
062: + dataDir.getAbsolutePath());
063: }
064: }
065:
066: // get/initiate a thread-local simple date format (must do so
067: // because SimpleDateFormat is not thread-safe.
068: protected synchronized DateFormat getDateFormat() {
069: DateFormat df = (DateFormat) dateFormat.get();
070: if (df == null) {
071: // date format: 30-MAR-1987 14:22:36.87
072: df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",
073: Locale.US);
074: df.setLenient(true);
075: dateFormat.set(df);
076: }
077: return df;
078: }
079:
080: protected DocData getNextDocData() throws Exception {
081: File f = null;
082: String name = null;
083: synchronized (this ) {
084: if (nextFile >= inputFiles.size()) {
085: // exhausted files, start a new round, unless forever set to false.
086: if (!forever) {
087: throw new NoMoreDataException();
088: }
089: nextFile = 0;
090: iteration++;
091: }
092: f = (File) inputFiles.get(nextFile++);
093: name = f.getCanonicalPath() + "_" + iteration;
094: }
095:
096: BufferedReader reader = new BufferedReader(new FileReader(f));
097: String line = null;
098: //First line is the date, 3rd is the title, rest is body
099: String dateStr = reader.readLine();
100: reader.readLine();//skip an empty line
101: String title = reader.readLine();
102: reader.readLine();//skip an empty line
103: StringBuffer bodyBuf = new StringBuffer(1024);
104: while ((line = reader.readLine()) != null) {
105: bodyBuf.append(line).append(' ');
106: }
107: reader.close();
108:
109: addBytes(f.length());
110:
111: Date date = getDateFormat().parse(dateStr.trim());
112: return new DocData(name, bodyBuf.toString(), title, null, date);
113: }
114:
115: /*
116: * (non-Javadoc)
117: * @see DocMaker#resetIinputs()
118: */
119: public synchronized void resetInputs() {
120: super .resetInputs();
121: nextFile = 0;
122: iteration = 0;
123: }
124:
125: /*
126: * (non-Javadoc)
127: * @see DocMaker#numUniqueTexts()
128: */
129: public int numUniqueTexts() {
130: return inputFiles.size();
131: }
132:
133: }
|