001: package org.apache.lucene.benchmark.byTask.feeds;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.benchmark.byTask.utils.Config;
021: import org.apache.lucene.benchmark.byTask.utils.Format;
022: import org.apache.lucene.document.DateTools;
023: import org.apache.lucene.document.Document;
024: import org.apache.lucene.document.Field;
025:
026: import java.io.File;
027: import java.io.UnsupportedEncodingException;
028: import java.util.ArrayList;
029: import java.util.Arrays;
030: import java.util.Iterator;
031:
032: /**
033: * Create documents for the test.
034: * Maintains counters of chars etc. so that sub-classes just need to
035: * provide textual content, and the create-by-size is handled here.
036: *
037: * <p/>
* Config Params (default is in caps):<br/>
* doc.stored=true|FALSE<br/>
* doc.tokenized=TRUE|false<br/>
* doc.term.vector=true|FALSE<br/>
* doc.term.vector.positions=true|FALSE<br/>
* doc.term.vector.offsets=true|FALSE<br/>
* doc.store.body.bytes=true|FALSE //Store the raw UTF-8 bytes of the body contents as a field<br/>
* doc.maker.forever=TRUE|false //Re-iterate the input files when the input data is exhausted<br/>
045: */
046: public abstract class BasicDocMaker implements DocMaker {
047:
048: private int numDocsCreated = 0;
049: private boolean storeBytes = false;
050: protected boolean forever;
051:
052: private static class LeftOver {
053: private DocData docdata;
054: private int cnt;
055: }
056:
057: // leftovers are thread local, because it is unsafe to share residues between threads
058: private ThreadLocal leftovr = new ThreadLocal();
059:
060: public static final String BODY_FIELD = "body";
061: public static final String TITLE_FIELD = "doctitle";
062: public static final String DATE_FIELD = "docdate";
063: public static final String ID_FIELD = "docid";
064: public static final String BYTES_FIELD = "bytes";
065: public static final String NAME_FIELD = "docname";
066:
067: private long numBytes = 0;
068: private long numUniqueBytes = 0;
069:
070: protected Config config;
071:
072: protected Field.Store storeVal = Field.Store.NO;
073: protected Field.Index indexVal = Field.Index.TOKENIZED;
074: protected Field.TermVector termVecVal = Field.TermVector.NO;
075:
076: private synchronized int incrNumDocsCreated() {
077: return numDocsCreated++;
078: }
079:
080: /**
081: * Return the data of the next document.
082: * All current implementations can create docs forever.
083: * When the input data is exhausted, input files are iterated.
084: * This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
085: * @return data of the next document.
086: * @exception if cannot create the next doc data
087: * @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
088: */
089: protected abstract DocData getNextDocData()
090: throws NoMoreDataException, Exception;
091:
092: /*
093: * (non-Javadoc)
094: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument()
095: */
096: public Document makeDocument() throws Exception {
097: resetLeftovers();
098: DocData docData = getNextDocData();
099: Document doc = createDocument(docData, 0, -1);
100: return doc;
101: }
102:
103: // create a doc
104: // use only part of the body, modify it to keep the rest (or use all if size==0).
105: // reset the docdata properties so they are not added more than once.
106: private Document createDocument(DocData docData, int size, int cnt)
107: throws UnsupportedEncodingException {
108: int docid = incrNumDocsCreated();
109: Document doc = new Document();
110: doc.add(new Field(ID_FIELD, "doc" + docid, storeVal, indexVal,
111: termVecVal));
112: if (docData.getName() != null) {
113: String name = (cnt < 0 ? docData.getName() : docData
114: .getName()
115: + "_" + cnt);
116: doc.add(new Field(NAME_FIELD, name, storeVal, indexVal,
117: termVecVal));
118: }
119: if (docData.getDate() != null) {
120: String dateStr = DateTools.dateToString(docData.getDate(),
121: DateTools.Resolution.SECOND);
122: doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal,
123: termVecVal));
124: }
125: if (docData.getTitle() != null) {
126: doc.add(new Field(TITLE_FIELD, docData.getTitle(),
127: storeVal, indexVal, termVecVal));
128: }
129: if (docData.getBody() != null && docData.getBody().length() > 0) {
130: String bdy;
131: if (size <= 0 || size >= docData.getBody().length()) {
132: bdy = docData.getBody(); // use all
133: docData.setBody(""); // nothing left
134: } else {
135: // attempt not to break words - if whitespace found within next 20 chars...
136: for (int n = size - 1; n < size + 20
137: && n < docData.getBody().length(); n++) {
138: if (Character.isWhitespace(docData.getBody()
139: .charAt(n))) {
140: size = n;
141: break;
142: }
143: }
144: bdy = docData.getBody().substring(0, size); // use part
145: docData.setBody(docData.getBody().substring(size)); // some left
146: }
147: doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal,
148: termVecVal));
149: if (storeBytes == true) {
150: doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"),
151: Field.Store.YES));
152: }
153: }
154:
155: if (docData.getProps() != null) {
156: for (Iterator it = docData.getProps().keySet().iterator(); it
157: .hasNext();) {
158: String key = (String) it.next();
159: String val = (String) docData.getProps().get(key);
160: doc.add(new Field(key, val, storeVal, indexVal,
161: termVecVal));
162: }
163: docData.setProps(null);
164: }
165: //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
166: return doc;
167: }
168:
169: /*
170: * (non-Javadoc)
171: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
172: */
173: public Document makeDocument(int size) throws Exception {
174: LeftOver lvr = (LeftOver) leftovr.get();
175: if (lvr == null || lvr.docdata == null
176: || lvr.docdata.getBody() == null
177: || lvr.docdata.getBody().length() == 0) {
178: resetLeftovers();
179: }
180: DocData dd = (lvr == null ? getNextDocData() : lvr.docdata);
181: int cnt = (lvr == null ? 0 : lvr.cnt);
182: while (dd.getBody() == null || dd.getBody().length() < size) {
183: DocData dd2 = dd;
184: dd = getNextDocData();
185: cnt = 0;
186: dd.setBody(dd2.getBody() + dd.getBody());
187: }
188: Document doc = createDocument(dd, size, cnt);
189: if (dd.getBody() == null || dd.getBody().length() == 0) {
190: resetLeftovers();
191: } else {
192: if (lvr == null) {
193: lvr = new LeftOver();
194: leftovr.set(lvr);
195: }
196: lvr.docdata = dd;
197: lvr.cnt = ++cnt;
198: }
199: return doc;
200: }
201:
202: private void resetLeftovers() {
203: leftovr.set(null);
204: }
205:
206: /* (non-Javadoc)
207: * @see DocMaker#setConfig(java.util.Properties)
208: */
209: public void setConfig(Config config) {
210: this .config = config;
211: boolean stored = config.get("doc.stored", false);
212: boolean tokenized = config.get("doc.tokenized", true);
213: boolean termVec = config.get("doc.term.vector", false);
214: storeVal = (stored ? Field.Store.YES : Field.Store.NO);
215: indexVal = (tokenized ? Field.Index.TOKENIZED
216: : Field.Index.UN_TOKENIZED);
217: boolean termVecPositions = config.get(
218: "doc.term.vector.positions", false);
219: boolean termVecOffsets = config.get("doc.term.vector.offsets",
220: false);
221: if (termVecPositions && termVecOffsets)
222: termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
223: else if (termVecPositions)
224: termVecVal = Field.TermVector.WITH_POSITIONS;
225: else if (termVecOffsets)
226: termVecVal = Field.TermVector.WITH_OFFSETS;
227: else if (termVec)
228: termVecVal = Field.TermVector.YES;
229: else
230: termVecVal = Field.TermVector.NO;
231: storeBytes = config.get("doc.store.body.bytes", false);
232: forever = config.get("doc.maker.forever", true);
233: }
234:
235: /*
236: * (non-Javadoc)
237: * @see DocMaker#resetIinputs()
238: */
239: public synchronized void resetInputs() {
240: printDocStatistics();
241: numBytes = 0;
242: numDocsCreated = 0;
243: resetLeftovers();
244: }
245:
246: /*
247: * (non-Javadoc)
248: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes()
249: */
250: public long numUniqueBytes() {
251: return numUniqueBytes;
252: }
253:
254: /*
255: * (non-Javadoc)
256: * @see DocMaker#getCount()
257: */
258: public synchronized int getCount() {
259: return numDocsCreated;
260: }
261:
262: /*
263: * (non-Javadoc)
264: * @see DocMaker#getByteCount()
265: */
266: public synchronized long getByteCount() {
267: return numBytes;
268: }
269:
270: protected void addUniqueBytes(long n) {
271: numUniqueBytes += n;
272: }
273:
274: protected synchronized void addBytes(long n) {
275: numBytes += n;
276: }
277:
278: /*
279: * (non-Javadoc)
280: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics()
281: */
282: private int lastPrintedNumUniqueTexts = 0;
283: private long lastPrintedNumUniqueBytes = 0;
284: private int printNum = 0;
285: private HTMLParser htmlParser;
286:
287: public void printDocStatistics() {
288: boolean print = false;
289: String col = " ";
290: StringBuffer sb = new StringBuffer();
291: String newline = System.getProperty("line.separator");
292: sb.append("------------> ").append(
293: Format.simpleName(getClass())).append(" statistics (")
294: .append(printNum).append("): ").append(newline);
295: int nut = numUniqueTexts();
296: if (nut > lastPrintedNumUniqueTexts) {
297: print = true;
298: sb.append("total count of unique texts: ").append(
299: Format.format(0, nut, col)).append(newline);
300: lastPrintedNumUniqueTexts = nut;
301: }
302: long nub = numUniqueBytes();
303: if (nub > lastPrintedNumUniqueBytes) {
304: print = true;
305: sb.append("total bytes of unique texts: ").append(
306: Format.format(0, nub, col)).append(newline);
307: lastPrintedNumUniqueBytes = nub;
308: }
309: if (getCount() > 0) {
310: print = true;
311: sb.append("num docs added since last inputs reset: ")
312: .append(Format.format(0, getCount(), col)).append(
313: newline);
314: sb.append("total bytes added since last inputs reset: ")
315: .append(Format.format(0, getByteCount(), col))
316: .append(newline);
317: }
318: if (print) {
319: System.out.println(sb.append(newline).toString());
320: printNum++;
321: }
322: }
323:
324: protected void collectFiles(File f, ArrayList inputFiles) {
325: //System.out.println("Collect: "+f.getAbsolutePath());
326: if (!f.canRead()) {
327: return;
328: }
329: if (f.isDirectory()) {
330: String files[] = f.list();
331: Arrays.sort(files);
332: for (int i = 0; i < files.length; i++) {
333: collectFiles(new File(f, files[i]), inputFiles);
334: }
335: return;
336: }
337: inputFiles.add(f);
338: addUniqueBytes(f.length());
339: }
340:
341: /* (non-Javadoc)
342: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser)
343: */
344: public void setHTMLParser(HTMLParser htmlParser) {
345: this .htmlParser = htmlParser;
346: }
347:
348: /*
349: * (non-Javadoc)
350: * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser()
351: */
352: public HTMLParser getHtmlParser() {
353: return htmlParser;
354: }
355:
356: }
|