001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.benchmark.quality.utils;
017:
018: import java.io.File;
019: import java.io.IOException;
020:
021: import org.apache.lucene.index.IndexReader;
022: import org.apache.lucene.index.Term;
023: import org.apache.lucene.index.TermEnum;
024: import org.apache.lucene.store.Directory;
025: import org.apache.lucene.store.FSDirectory;
026: import org.apache.lucene.util.PriorityQueue;
027:
028: /**
029: * Suggest Quality queries based on an index contents.
030: * Utility class, used for making quality test benchmarks.
031: */
032: public class QualityQueriesFinder {
033:
034: private static final String newline = System
035: .getProperty("line.separator");
036: private Directory dir;
037:
038: /**
039: * Constrctor over a directory containing the index.
040: * @param dir directory containing the index we search for the quality test.
041: */
042: private QualityQueriesFinder(Directory dir) {
043: this .dir = dir;
044: }
045:
046: /**
047: * @param args {index-dir}
048: * @throws IOException if cannot access the index.
049: */
050: public static void main(String[] args) throws IOException {
051: if (args.length < 1) {
052: System.err
053: .println("Usage: java QualityQueriesFinder <index-dir>");
054: System.exit(1);
055: }
056: QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory
057: .getDirectory(new File(args[0])));
058: String q[] = qqf.bestQueries("body", 20);
059: for (int i = 0; i < q.length; i++) {
060: System.out.println(newline
061: + formatQueryAsTrecTopic(i, q[i], null, null));
062: }
063: }
064:
065: private String[] bestQueries(String field, int numQueries)
066: throws IOException {
067: String words[] = bestTerms("body", 4 * numQueries);
068: int n = words.length;
069: int m = n / 4;
070: String res[] = new String[m];
071: for (int i = 0; i < res.length; i++) {
072: res[i] = words[i] + " " + words[m + i] + " "
073: + words[n - 1 - m - i] + " " + words[n - 1 - i];
074: //System.out.println("query["+i+"]: "+res[i]);
075: }
076: return res;
077: }
078:
079: private static String formatQueryAsTrecTopic(int qnum,
080: String title, String description, String narrative) {
081: return "<top>" + newline + "<num> Number: " + qnum + newline
082: + newline + "<title> " + (title == null ? "" : title)
083: + newline + newline + "<desc> Description:" + newline
084: + (description == null ? "" : description) + newline
085: + newline + "<narr> Narrative:" + newline
086: + (narrative == null ? "" : narrative) + newline
087: + newline + "</top>";
088: }
089:
090: private String[] bestTerms(String field, int numTerms)
091: throws IOException {
092: PriorityQueue pq = new TermsDfQueue(numTerms);
093: IndexReader ir = IndexReader.open(dir);
094: try {
095: int threshold = ir.maxDoc() / 10; // ignore words too common.
096: TermEnum terms = ir.terms(new Term(field, ""));
097: while (terms.next()) {
098: if (!field.equals(terms.term().field())) {
099: break;
100: }
101: int df = terms.docFreq();
102: if (df < threshold) {
103: String ttxt = terms.term().text();
104: pq.insert(new TermDf(ttxt, df));
105: }
106: }
107: } finally {
108: ir.close();
109: }
110: String res[] = new String[pq.size()];
111: int i = 0;
112: while (pq.size() > 0) {
113: TermDf tdf = (TermDf) pq.pop();
114: res[i++] = tdf.word;
115: System.out.println(i + ". word: " + tdf.df + " "
116: + tdf.word);
117: }
118: return res;
119: }
120:
121: private static class TermDf {
122: String word;
123: int df;
124:
125: TermDf(String word, int freq) {
126: this .word = word;
127: this .df = freq;
128: }
129: }
130:
131: private static class TermsDfQueue extends PriorityQueue {
132: TermsDfQueue(int maxSize) {
133: initialize(maxSize);
134: }
135:
136: protected boolean lessThan(Object a, Object b) {
137: TermDf tf1 = (TermDf) a;
138: TermDf tf2 = (TermDf) b;
139: return tf1.df < tf2.df;
140: }
141: }
142:
143: }
|