001: package org.apache.lucene.benchmark.byTask.feeds;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.benchmark.byTask.utils.Config;
021:
022: import java.io.BufferedReader;
023: import java.io.File;
024: import java.io.FileFilter;
025: import java.io.FileReader;
026: import java.text.DateFormat;
027: import java.text.SimpleDateFormat;
028: import java.util.Arrays;
029: import java.util.Date;
030: import java.util.Locale;
031: import java.util.Stack;
032:
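/**
 * A DocMaker that takes its input from the "Dir" collection: the ".txt"
 * files found in a directory and, recursively, in its sub-directories.
 *
 * Config properties:
 * <ul>
 * <li>docs.dir=&lt;path to the docs dir | Default: dir-out&gt;
 * (a relative path is resolved under the "work" directory)</li>
 * </ul>
 */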
041: public class DirDocMaker extends BasicDocMaker {
042:
043: private ThreadLocal dateFormat = new ThreadLocal();
044: private File dataDir = null;
045: private int iteration = 0;
046:
047: static public class Iterator implements java.util.Iterator {
048:
049: int count = 0;
050:
051: public int getCount() {
052: return count;
053: }
054:
055: Stack stack = new Stack();
056:
057: /* this seems silly ... there must be a better way ...
058: not that this is good, but can it matter? */
059:
060: static class Comparator implements java.util.Comparator {
061: public int compare(Object _a, Object _b) {
062: String a = _a.toString();
063: String b = _b.toString();
064:
065: int diff = a.length() - b.length();
066:
067: if (diff > 0) {
068: while (diff-- > 0) {
069: b = "0" + b;
070: }
071: } else if (diff < 0) {
072: diff = -diff;
073: while (diff-- > 0) {
074: a = "0" + a;
075: }
076: }
077:
078: /* note it's reversed because we're going to push,
079: which reverses again */
080: return b.compareTo(a);
081: }
082: }
083:
084: Comparator c = new Comparator();
085:
086: void push(File[] files) {
087: Arrays.sort(files, c);
088: for (int i = 0; i < files.length; i++) {
089: // System.err.println("push " + files[i]);
090: stack.push(files[i]);
091: }
092: }
093:
094: void push(File f) {
095: push(f.listFiles(new FileFilter() {
096: public boolean accept(File f) {
097: return f.isDirectory();
098: }
099: }));
100: push(f.listFiles(new FileFilter() {
101: public boolean accept(File f) {
102: return f.getName().endsWith(".txt");
103: }
104: }));
105: find();
106: }
107:
108: void find() {
109: if (stack.empty()) {
110: return;
111: }
112: if (!((File) stack.peek()).isDirectory()) {
113: return;
114: }
115: File f = (File) stack.pop();
116: push(f);
117: }
118:
119: public Iterator(File f) {
120: push(f);
121: }
122:
123: public void remove() {
124: throw new RuntimeException("cannot");
125: }
126:
127: public boolean hasNext() {
128: return stack.size() > 0;
129: }
130:
131: public Object next() {
132: assert hasNext();
133: count++;
134: Object object = stack.pop();
135: // System.err.println("pop " + object);
136: find();
137: return object;
138: }
139:
140: }
141:
142: private Iterator inputFiles = null;
143:
144: /* (non-Javadoc)
145: * @see SimpleDocMaker#setConfig(java.util.Properties)
146: */
147: public void setConfig(Config config) {
148: super .setConfig(config);
149: String d = config.get("docs.dir", "dir-out");
150: dataDir = new File(d);
151: if (!dataDir.isAbsolute()) {
152: dataDir = new File(new File("work"), d);
153: }
154:
155: inputFiles = new Iterator(dataDir);
156:
157: if (inputFiles == null) {
158: throw new RuntimeException("No txt files in dataDir: "
159: + dataDir.getAbsolutePath());
160: }
161: }
162:
163: // get/initiate a thread-local simple date format (must do so
164: // because SimpleDateFormat is not thread-safe).
165: protected DateFormat getDateFormat() {
166: DateFormat df = (DateFormat) dateFormat.get();
167: if (df == null) {
168: // date format: 30-MAR-1987 14:22:36.87
169: df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",
170: Locale.US);
171: df.setLenient(true);
172: dateFormat.set(df);
173: }
174: return df;
175: }
176:
177: protected DocData getNextDocData() throws Exception {
178: File f = null;
179: String name = null;
180: synchronized (this ) {
181: if (!inputFiles.hasNext()) {
182: // exhausted files, start a new round, unless forever set to false.
183: if (!forever) {
184: throw new NoMoreDataException();
185: }
186: inputFiles = new Iterator(dataDir);
187: iteration++;
188: }
189: f = (File) inputFiles.next();
190: // System.err.println(f);
191: name = f.getCanonicalPath() + "_" + iteration;
192: }
193:
194: BufferedReader reader = new BufferedReader(new FileReader(f));
195: String line = null;
196: //First line is the date, 3rd is the title, rest is body
197: String dateStr = reader.readLine();
198: reader.readLine();//skip an empty line
199: String title = reader.readLine();
200: reader.readLine();//skip an empty line
201: StringBuffer bodyBuf = new StringBuffer(1024);
202: while ((line = reader.readLine()) != null) {
203: bodyBuf.append(line).append(' ');
204: }
205: reader.close();
206: addBytes(f.length());
207:
208: Date date = getDateFormat().parse(dateStr.trim());
209: return new DocData(name, bodyBuf.toString(), title, null, date);
210: }
211:
212: /*
213: * (non-Javadoc)
214: * @see DocMaker#resetIinputs()
215: */
216: public synchronized void resetInputs() {
217: super .resetInputs();
218: inputFiles = new Iterator(dataDir);
219: iteration = 0;
220: }
221:
222: /*
223: * (non-Javadoc)
224: * @see DocMaker#numUniqueTexts()
225: */
226: public int numUniqueTexts() {
227: return inputFiles.getCount();
228: }
229:
230: }
|