001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.benchmark.quality;
017:
018: import java.io.PrintWriter;
019: import java.text.NumberFormat;
020: import java.util.ArrayList;
021:
022: /**
023: * Results of quality benchmark run for a single query or for a set of queries.
024: */
025: public class QualityStats {
026:
027: /** Number of points for which precision is computed. */
028: public static final int MAX_POINTS = 20;
029:
030: private double maxGoodPoints;
031: private double recall;
032: private double pAt[];
033: private double pReleventSum = 0;
034: private double numPoints = 0;
035: private double numGoodPoints = 0;
036: private double mrr = 0;
037: private long searchTime;
038: private long docNamesExtractTime;
039:
040: /**
041: * A certain rank in which a relevant doc was found.
042: */
043: public static class RecallPoint {
044: private int rank;
045: private double recall;
046:
047: private RecallPoint(int rank, double recall) {
048: this .rank = rank;
049: this .recall = recall;
050: }
051:
052: /** Returns the rank: where on the list of returned docs this relevant doc appeared. */
053: public int getRank() {
054: return rank;
055: }
056:
057: /** Returns the recall: how many relevant docs were returned up to this point, inclusive. */
058: public double getRecall() {
059: return recall;
060: }
061: }
062:
063: private ArrayList recallPoints;
064:
065: /**
066: * Construct a QualityStats object with anticipated maximal number of relevant hits.
067: * @param maxGoodPoints maximal possible relevant hits.
068: */
069: public QualityStats(double maxGoodPoints, long searchTime) {
070: this .maxGoodPoints = maxGoodPoints;
071: this .searchTime = searchTime;
072: this .recallPoints = new ArrayList();
073: pAt = new double[MAX_POINTS + 1]; // pAt[0] unused.
074: }
075:
076: /**
077: * Add a (possibly relevant) doc.
078: * @param n rank of the added doc (its ordinal position within the query results).
079: * @param isRelevant true if the added doc is relevant, false otherwise.
080: */
081: public void addResult(int n, boolean isRelevant,
082: long docNameExtractTime) {
083: if (Math.abs(numPoints + 1 - n) > 1E-6) {
084: throw new IllegalArgumentException("point " + n
085: + " illegal after " + numPoints + " points!");
086: }
087: if (isRelevant) {
088: numGoodPoints += 1;
089: recallPoints.add(new RecallPoint(n, numGoodPoints));
090: if (recallPoints.size() == 1 && n <= 5) { // first point, but only within 5 top scores.
091: mrr = 1.0 / n;
092: }
093: }
094: numPoints = n;
095: double p = numGoodPoints / numPoints;
096: if (isRelevant) {
097: pReleventSum += p;
098: }
099: if (n < pAt.length) {
100: pAt[n] = p;
101: }
102: recall = maxGoodPoints <= 0 ? p : numGoodPoints / maxGoodPoints;
103: docNamesExtractTime += docNameExtractTime;
104: }
105:
106: /**
107: * Return the precision at rank n:
108: * |{relevant hits within first <code>n</code> hits}| / <code>n</code>.
109: * @param n requested precision point, must be at least 1 and at most {@link #MAX_POINTS}.
110: */
111: public double getPrecisionAt(int n) {
112: if (n < 1 || n > MAX_POINTS) {
113: throw new IllegalArgumentException("n=" + n
114: + " - but it must be in [1," + MAX_POINTS
115: + "] range!");
116: }
117: if (n > numPoints) {
118: return (numPoints * pAt[(int) numPoints]) / n;
119: }
120: return pAt[n];
121: }
122:
123: /**
124: * Return the average precision at recall points.
125: */
126: public double getAvp() {
127: return maxGoodPoints == 0 ? 0 : pReleventSum / maxGoodPoints;
128: }
129:
130: /**
131: * Return the recall: |{relevant hits}| / |{hits}|.
132: */
133: public double getRecall() {
134: return recall;
135: }
136:
137: /**
138: * Log information on this QualityStats object.
139: * @param logger Logger.
140: * @param prefix prefix before each log line.
141: */
142: public void log(String title, int paddLines, PrintWriter logger,
143: String prefix) {
144: for (int i = 0; i < paddLines; i++) {
145: logger.println();
146: }
147: if (title != null && title.trim().length() > 0) {
148: logger.println(title);
149: }
150: prefix = prefix == null ? "" : prefix;
151: NumberFormat nf = NumberFormat.getInstance();
152: nf.setMaximumFractionDigits(3);
153: nf.setMinimumFractionDigits(3);
154: nf.setGroupingUsed(true);
155: int M = 19;
156: logger.println(prefix + format("Search Seconds: ", M)
157: + fracFormat(nf.format((double) searchTime / 1000)));
158: logger.println(prefix
159: + format("DocName Seconds: ", M)
160: + fracFormat(nf
161: .format((double) docNamesExtractTime / 1000)));
162: logger.println(prefix + format("Num Points: ", M)
163: + fracFormat(nf.format(numPoints)));
164: logger.println(prefix + format("Num Good Points: ", M)
165: + fracFormat(nf.format(numGoodPoints)));
166: logger.println(prefix + format("Max Good Points: ", M)
167: + fracFormat(nf.format(maxGoodPoints)));
168: logger.println(prefix + format("Average Precision: ", M)
169: + fracFormat(nf.format(getAvp())));
170: logger.println(prefix + format("MRR: ", M)
171: + fracFormat(nf.format(getMRR())));
172: logger.println(prefix + format("Recall: ", M)
173: + fracFormat(nf.format(getRecall())));
174: for (int i = 1; i < (int) numPoints && i < pAt.length; i++) {
175: logger.println(prefix
176: + format("Precision At " + i + ": ", M)
177: + fracFormat(nf.format(getPrecisionAt(i))));
178: }
179: for (int i = 0; i < paddLines; i++) {
180: logger.println();
181: }
182: }
183:
184: private static String padd = " ";
185:
186: private String format(String s, int minLen) {
187: s = (s == null ? "" : s);
188: int n = Math.max(minLen, s.length());
189: return (s + padd).substring(0, n);
190: }
191:
192: private String fracFormat(String frac) {
193: int k = frac.indexOf('.');
194: String s1 = padd + frac.substring(0, k);
195: int n = Math.max(k, 6);
196: s1 = s1.substring(s1.length() - n);
197: return s1 + frac.substring(k);
198: }
199:
200: /**
201: * Create a QualityStats object that is the average of the input QualityStats objects.
202: * @param stats array of input stats to be averaged.
203: * @return an average over the input stats.
204: */
205: public static QualityStats average(QualityStats[] stats) {
206: QualityStats avg = new QualityStats(0, 0);
207: if (stats.length == 0) {
208: // weired, no stats to average!
209: return avg;
210: }
211: int m = 0; // queries with positive judgements
212: // aggregate
213: for (int i = 0; i < stats.length; i++) {
214: avg.searchTime += stats[i].searchTime;
215: avg.docNamesExtractTime += stats[i].docNamesExtractTime;
216: if (stats[i].maxGoodPoints > 0) {
217: m++;
218: avg.numGoodPoints += stats[i].numGoodPoints;
219: avg.numPoints += stats[i].numPoints;
220: avg.pReleventSum += stats[i].getAvp();
221: avg.recall += stats[i].recall;
222: avg.mrr += stats[i].getMRR();
223: avg.maxGoodPoints += stats[i].maxGoodPoints;
224: for (int j = 1; j < avg.pAt.length; j++) {
225: avg.pAt[j] += stats[i].getPrecisionAt(j);
226: }
227: }
228: }
229: assert m > 0 : "Fishy: no \"good\" queries!";
230: // take average: times go by all queries, other meassures go by "good" queries noly.
231: avg.searchTime /= stats.length;
232: avg.docNamesExtractTime /= stats.length;
233: avg.numGoodPoints /= m;
234: avg.numPoints /= m;
235: avg.recall /= m;
236: avg.mrr /= m;
237: avg.maxGoodPoints /= m;
238: for (int j = 1; j < avg.pAt.length; j++) {
239: avg.pAt[j] /= m;
240: }
241: avg.pReleventSum /= m; // this is actually avgp now
242: avg.pReleventSum *= avg.maxGoodPoints; // so that getAvgP() would be correct
243:
244: return avg;
245: }
246:
247: /**
248: * Returns the time it took to extract doc names for judging the measured query, in milliseconds.
249: */
250: public long getDocNamesExtractTime() {
251: return docNamesExtractTime;
252: }
253:
254: /**
255: * Returns the maximal number of good points.
256: * This is the number of relevant docs known by the judge for the measured query.
257: */
258: public double getMaxGoodPoints() {
259: return maxGoodPoints;
260: }
261:
262: /**
263: * Returns the number of good points (only relevant points).
264: */
265: public double getNumGoodPoints() {
266: return numGoodPoints;
267: }
268:
269: /**
270: * Returns the number of points (both relevant and irrelevant points).
271: */
272: public double getNumPoints() {
273: return numPoints;
274: }
275:
276: /**
277: * Returns the recallPoints.
278: */
279: public RecallPoint[] getRecallPoints() {
280: return (RecallPoint[]) recallPoints.toArray(new RecallPoint[0]);
281: }
282:
283: /**
284: * Returns the Mean reciprocal rank over the queries or RR for a single query.
285: * <p>
286: * Reciprocal rank is defined as <code>1/r</code> where <code>r</code> is the
287: * rank of the first correct result, or <code>0</code> if there are no correct
288: * results within the top 5 results.
289: * <p>
290: * This follows the definition in
291: * <a href="http://www.cnlp.org/publications/02cnlptrec10.pdf">
292: * Question Answering - CNLP at the TREC-10 Question Answering Track</a>.
293: */
294: public double getMRR() {
295: return mrr;
296: }
297:
298: /**
299: * Returns the search time in milliseconds for the measured query.
300: */
301: public long getSearchTime() {
302: return searchTime;
303: }
304:
305: }
|