001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.benchmark.quality.trec;
017:
018: import java.io.BufferedReader;
019: import java.io.IOException;
020: import java.io.PrintWriter;
021: import java.util.ArrayList;
022: import java.util.HashMap;
023: import java.util.Iterator;
024: import java.util.StringTokenizer;
025:
026: import org.apache.lucene.benchmark.quality.Judge;
027: import org.apache.lucene.benchmark.quality.QualityQuery;
028:
029: /**
030: * Judge if given document is relevant to given quality query, based on Trec format for judgements.
031: */
032: public class TrecJudge implements Judge {
033:
034: HashMap judgements;
035:
036: /**
037: * Constructor from a reader.
038: * <p>
039: * Expected input format:
040: * <pre>
041: * qnum 0 doc-name is-relevant
042: * </pre>
043: * Two sample lines:
044: * <pre>
045: * 19 0 doc303 1
046: * 19 0 doc7295 0
047: * </pre>
048: * @param reader where judgments are read from.
049: * @throws IOException
050: */
051: public TrecJudge(BufferedReader reader) throws IOException {
052: judgements = new HashMap();
053: QRelJudgement curr = null;
054: String zero = "0";
055: String line;
056:
057: try {
058: while (null != (line = reader.readLine())) {
059: line = line.trim();
060: if (line.length() == 0 || '#' == line.charAt(0)) {
061: continue;
062: }
063: StringTokenizer st = new StringTokenizer(line);
064: String queryID = st.nextToken();
065: st.nextToken();
066: String docName = st.nextToken();
067: boolean relevant = !zero.equals(st.nextToken());
068: assert !st.hasMoreTokens() : "wrong format: " + line
069: + " next: " + st.nextToken();
070: if (relevant) { // only keep relevant docs
071: if (curr == null || !curr.queryID.equals(queryID)) {
072: curr = (QRelJudgement) judgements.get(queryID);
073: if (curr == null) {
074: curr = new QRelJudgement(queryID);
075: judgements.put(queryID, curr);
076: }
077: }
078: curr.addRelevandDoc(docName);
079: }
080: }
081: } finally {
082: reader.close();
083: }
084: }
085:
086: // inherit javadocs
087: public boolean isRelevant(String docName, QualityQuery query) {
088: QRelJudgement qrj = (QRelJudgement) judgements.get(query
089: .getQueryID());
090: return qrj != null && qrj.isRelevant(docName);
091: }
092:
093: /** single Judgement of a trec quality query */
094: private static class QRelJudgement {
095: private String queryID;
096: private HashMap relevantDocs;
097:
098: QRelJudgement(String queryID) {
099: this .queryID = queryID;
100: relevantDocs = new HashMap();
101: }
102:
103: public void addRelevandDoc(String docName) {
104: relevantDocs.put(docName, docName);
105: }
106:
107: boolean isRelevant(String docName) {
108: return relevantDocs.containsKey(docName);
109: }
110:
111: public int maxRecall() {
112: return relevantDocs.size();
113: }
114: }
115:
116: // inherit javadocs
117: public boolean validateData(QualityQuery[] qq, PrintWriter logger) {
118: HashMap missingQueries = (HashMap) judgements.clone();
119: ArrayList missingJudgements = new ArrayList();
120: for (int i = 0; i < qq.length; i++) {
121: String id = qq[i].getQueryID();
122: if (missingQueries.containsKey(id)) {
123: missingQueries.remove(id);
124: } else {
125: missingJudgements.add(id);
126: }
127: }
128: boolean isValid = true;
129: if (missingJudgements.size() > 0) {
130: isValid = false;
131: if (logger != null) {
132: logger.println("WARNING: " + missingJudgements.size()
133: + " queries have no judgments! - ");
134: for (int i = 0; i < missingJudgements.size(); i++) {
135: logger.println(" "
136: + (String) missingJudgements.get(i));
137: }
138: }
139: }
140: if (missingQueries.size() > 0) {
141: isValid = false;
142: if (logger != null) {
143: logger.println("WARNING: " + missingQueries.size()
144: + " judgments match no query! - ");
145: for (Iterator it = missingQueries.keySet().iterator(); it
146: .hasNext();) {
147: String id = (String) it.next();
148: logger.println(" " + id);
149: }
150: }
151: }
152: return isValid;
153: }
154:
155: // inherit javadocs
156: public int maxRecall(QualityQuery query) {
157: QRelJudgement qrj = (QRelJudgement) judgements.get(query
158: .getQueryID());
159: if (qrj != null) {
160: return qrj.maxRecall();
161: }
162: return 0;
163: }
164: }
|