package org.apache.lucene.benchmark.byTask.tasks;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.text.NumberFormat;
import java.io.Reader;
import java.util.List;

/**
 * Simple task to test the performance of tokenizers. It simply creates a
 * token stream for each field of the document and reads all tokens out
 * of that stream.
 * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
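 * <p>For example, a benchmark algorithm (.alg) file might set the log step
 * and repeat this task (a sketch only; the content-source and analyzer
 * properties depend on the actual configuration):
 * <pre>
 * doc.tokenize.log.step=1000
 * { ReadTokens } : 10000
 * </pre>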
 */
public class ReadTokensTask extends PerfTask {

  /**
   * Default value for property <code>doc.tokenize.log.step</code> - indicating how often
   * an "added N docs / M tokens" message should be logged.
   */
  public static final int DEFAULT_DOC_LOG_STEP = 500;

  public ReadTokensTask(PerfRunData runData) {
    super(runData);
  }

  private int logStep = -1;
  int count = 0;
  int totalTokenCount = 0;

  // volatile data passed between setup(), doLogic(), tearDown().
  private Document doc = null;

  /*
   * (non-Javadoc)
   * @see PerfTask#setup()
   */
  public void setup() throws Exception {
    super.setup();
    DocMaker docMaker = getRunData().getDocMaker();
    doc = docMaker.makeDocument();
  }

  /* (non-Javadoc)
   * @see PerfTask#tearDown()
   */
  public void tearDown() throws Exception {
    log(++count);
    doc = null;
    super.tearDown();
  }

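  // Reused across calls to TokenStream.next(Token) below, so we avoid
  // allocating a new Token for every term we read.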
  Token token = new Token();

  public int doLogic() throws Exception {
    List fields = doc.getFields();
    final int numField = fields.size();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
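    // For each field, prefer an already supplied TokenStream, then a
    // Reader, and finally tokenize the plain String value.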
    for (int i = 0; i < numField; i++) {
      final Field field = (Field) fields.get(i);
      final TokenStream stream;
      final TokenStream streamValue = field.tokenStreamValue();

      if (streamValue != null)
        stream = streamValue;
      else {
        // the field does not have a TokenStream,
        // so we have to obtain one from the analyzer
        final Reader reader;    // find or make Reader
        final Reader readerValue = field.readerValue();

        if (readerValue != null)
          reader = readerValue;
        else {
          String stringValue = field.stringValue();
          if (stringValue == null)
            throw new IllegalArgumentException(
                "field must have either TokenStream, String or Reader value");
          stringReader.init(stringValue);
          reader = stringReader;
        }

        // Tokenize field
        stream = analyzer.reusableTokenStream(field.name(), reader);
      }

      // reset the TokenStream to the first token
      stream.reset();

      while (stream.next(token) != null)
        tokenCount++;
    }
    totalTokenCount += tokenCount;
    return tokenCount;
  }

  private void log(int count) {
    if (logStep < 0) {
      // init once per instance
      logStep = getRunData().getConfig().get("doc.tokenize.log.step",
          DEFAULT_DOC_LOG_STEP);
    }
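    // a zero (or negative) doc.tokenize.log.step disables this message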
    if (logStep > 0 && (count % logStep) == 0) {
      double seconds = (System.currentTimeMillis() - getRunData()
          .getStartTimeMillis()) / 1000.0;
      NumberFormat nf = NumberFormat.getInstance();
      nf.setMaximumFractionDigits(2);
      System.out.println("--> " + nf.format(seconds) + " sec: "
          + Thread.currentThread().getName()
          + " processed (add) " + count + " docs; "
          + totalTokenCount + " tokens");
    }
  }

  /* Simple StringReader that can be reset to a new string;
   * we use this when tokenizing the string value from a
   * Field. */
  ReusableStringReader stringReader = new ReusableStringReader();

  private final static class ReusableStringReader extends Reader {
    int upto;   // index of the next char to return from s
    int left;   // number of chars remaining in s
    String s;   // the current string being read

    void init(String s) {
      this.s = s;
      left = s.length();
      this.upto = 0;
    }

    public int read(char[] c) {
      return read(c, 0, c.length);
    }

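    // Copies up to len chars into c and returns the number copied, or -1
    // once the string is exhausted. The three branches handle, in order:
    // a full-length chunk remaining, end of stream, and the final partial
    // chunk.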
    public int read(char[] c, int off, int len) {
      if (left > len) {
        s.getChars(upto, upto + len, c, off);
        upto += len;
        left -= len;
        return len;
      } else if (0 == left) {
        return -1;
      } else {
        s.getChars(upto, upto + left, c, off);
        int r = left;
        left = 0;
        upto = s.length();
        return r;
      }
    }

    public void close() {
      // nothing to release: this reader wraps an in-memory String
    }
  }
}
|