001: package org.apache.lucene.benchmark.byTask.feeds;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.BufferedInputStream;
021: import java.io.BufferedReader;
022: import java.io.File;
023: import java.io.FileInputStream;
024: import java.io.IOException;
025: import java.io.InputStreamReader;
026: import java.text.DateFormat;
027: import java.text.ParseException;
028: import java.text.SimpleDateFormat;
029: import java.util.ArrayList;
030: import java.util.Date;
031: import java.util.Locale;
032: import java.util.zip.GZIPInputStream;
033:
034: import org.apache.lucene.benchmark.byTask.utils.Config;
035:
036: /**
037: * A DocMaker using the (compressed) Trec collection for its input.
038: * <p>
039: * Config properties:<ul>
040: * <li>work.dir=<path to the root of docs and indexes dirs| Default: work></li>
041: * <li>docs.dir=<path to the docs dir| Default: trec></li>
042: * </ul>
043: */
044: public class TrecDocMaker extends BasicDocMaker {
045:
046: private static final String newline = System
047: .getProperty("line.separator");
048:
049: private ThreadLocal dateFormat = new ThreadLocal();
050: private File dataDir = null;
051: private ArrayList inputFiles = new ArrayList();
052: private int nextFile = 0;
053: private int iteration = 0;
054: private BufferedReader reader;
055: private GZIPInputStream zis;
056:
057: private static final String DATE_FORMATS[] = {
058: "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
059: "EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST
060: "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
061: "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
062: };
063:
064: /* (non-Javadoc)
065: * @see SimpleDocMaker#setConfig(java.util.Properties)
066: */
067: public void setConfig(Config config) {
068: super .setConfig(config);
069: File workDir = new File(config.get("work.dir", "work"));
070: String d = config.get("docs.dir", "trec");
071: dataDir = new File(d);
072: if (!dataDir.isAbsolute()) {
073: dataDir = new File(workDir, d);
074: }
075: collectFiles(dataDir, inputFiles);
076: if (inputFiles.size() == 0) {
077: throw new RuntimeException("No txt files in dataDir: "
078: + dataDir.getAbsolutePath());
079: }
080: }
081:
082: private void openNextFile() throws NoMoreDataException, Exception {
083: closeInputs();
084: int retries = 0;
085: while (true) {
086: File f = null;
087: synchronized (this ) {
088: if (nextFile >= inputFiles.size()) {
089: // exhausted files, start a new round, unless forever set to false.
090: if (!forever) {
091: throw new NoMoreDataException();
092: }
093: nextFile = 0;
094: iteration++;
095: }
096: f = (File) inputFiles.get(nextFile++);
097: }
098: System.out.println("opening: " + f + " length: "
099: + f.length());
100: try {
101: zis = new GZIPInputStream(new BufferedInputStream(
102: new FileInputStream(f)));
103: reader = new BufferedReader(new InputStreamReader(zis));
104: return;
105: } catch (Exception e) {
106: retries++;
107: if (retries < 20) {
108: System.out.println("Skipping 'bad' file "
109: + f.getAbsolutePath() + " #retries="
110: + retries);
111: continue;
112: } else {
113: throw new NoMoreDataException();
114: }
115: }
116: }
117: }
118:
119: private void closeInputs() {
120: if (zis != null) {
121: try {
122: zis.close();
123: } catch (IOException e) {
124: System.out.println("closeInputs(): Ingnoring error: "
125: + e);
126: e.printStackTrace();
127: }
128: zis = null;
129: }
130: if (reader != null) {
131: try {
132: reader.close();
133: } catch (IOException e) {
134: System.out.println("closeInputs(): Ingnoring error: "
135: + e);
136: e.printStackTrace();
137: }
138: reader = null;
139: }
140: }
141:
142: // read until finding a line that starts with the specified prefix
143: private StringBuffer read(String prefix, StringBuffer sb,
144: boolean collectMatchLine, boolean collectAll)
145: throws Exception {
146: sb = (sb == null ? new StringBuffer() : sb);
147: String sep = "";
148: while (true) {
149: String line = reader.readLine();
150: if (line == null) {
151: openNextFile();
152: continue;
153: }
154: if (line.startsWith(prefix)) {
155: if (collectMatchLine) {
156: sb.append(sep + line);
157: sep = newline;
158: }
159: break;
160: }
161: if (collectAll) {
162: sb.append(sep + line);
163: sep = newline;
164: }
165: }
166: //System.out.println("read: "+sb);
167: return sb;
168: }
169:
170: protected synchronized DocData getNextDocData()
171: throws NoMoreDataException, Exception {
172: if (reader == null) {
173: openNextFile();
174: }
175: // 1. skip until doc start
176: read("<DOC>", null, false, false);
177: // 2. name
178: StringBuffer sb = read("<DOCNO>", null, true, false);
179: String name = sb.substring("<DOCNO>".length());
180: name = name.substring(0, name.indexOf("</DOCNO>")) + "_"
181: + iteration;
182: // 3. skip until doc header
183: read("<DOCHDR>", null, false, false);
184: // 4. date
185: sb = read("Date: ", null, true, false);
186: String dateStr = sb.substring("Date: ".length());
187: // 5. skip until end of doc header
188: read("</DOCHDR>", null, false, false);
189: // 6. collect until end of doc
190: sb = read("</DOC>", null, false, true);
191: // this is the next document, so parse it
192: Date date = parseDate(dateStr);
193: HTMLParser p = getHtmlParser();
194: DocData docData = p.parse(name, date, sb, getDateFormat(0));
195: addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
196:
197: return docData;
198: }
199:
200: private DateFormat getDateFormat(int n) {
201: DateFormat df[] = (DateFormat[]) dateFormat.get();
202: if (df == null) {
203: df = new SimpleDateFormat[DATE_FORMATS.length];
204: for (int i = 0; i < df.length; i++) {
205: df[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
206: df[i].setLenient(true);
207: }
208: dateFormat.set(df);
209: }
210: return df[n];
211: }
212:
213: private Date parseDate(String dateStr) {
214: Date date = null;
215: for (int i = 0; i < DATE_FORMATS.length; i++) {
216: try {
217: date = getDateFormat(i).parse(dateStr.trim());
218: return date;
219: } catch (ParseException e) {
220: }
221: }
222: // do not fail test just because a date could not be parsed
223: System.out
224: .println("ignoring date parse exception (assigning 'now') for: "
225: + dateStr);
226: date = new Date(); // now
227: return date;
228: }
229:
230: /*
231: * (non-Javadoc)
232: * @see DocMaker#resetIinputs()
233: */
234: public synchronized void resetInputs() {
235: super .resetInputs();
236: closeInputs();
237: nextFile = 0;
238: iteration = 0;
239: }
240:
241: /*
242: * (non-Javadoc)
243: * @see DocMaker#numUniqueTexts()
244: */
245: public int numUniqueTexts() {
246: return inputFiles.size();
247: }
248:
249: }
|