001: package org.apache.lucene.search;
002:
003: /**
004: * Copyright 2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import org.apache.lucene.index.IndexReader;
020:
021: import java.io.IOException;
022: import java.util.ArrayList;
023: import java.util.Iterator;
024: import java.util.Collection;
025: import java.util.Set;
026:
027: /**
028: * A query that generates the union of documents produced by its subqueries, and that scores each document with the maximum
029: * score for that document as produced by any subquery, plus a tie breaking increment for any additional matching subqueries.
030: * This is useful when searching for a word in multiple fields with different boost factors (so that the fields cannot be
031: * combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost,
032: * not the sum of the field scores (as BooleanQuery would give).
033: * If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching
034: * another gets a higher score than "albino" matching both fields.
035: * To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in
036: * each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery.
037: * The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that
038: * include this term in only the best of those multiple fields, without confusing this with the better case of two different terms
039: * in the multiple fields.
040: * @author Chuck Williams
041: */
042: public class DisjunctionMaxQuery extends Query {
043:
044: /* The subqueries */
045: private ArrayList disjuncts = new ArrayList();
046:
047: /* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */
048: private float tieBreakerMultiplier = 0.0f;
049:
050: /** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries.
051: * @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is multiplied by this weight
052: * and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that
053: * 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique
054: * word in the lower scored field (i.e., one that is not in any higher scored field.
055: */
056: public DisjunctionMaxQuery(float tieBreakerMultiplier) {
057: this .tieBreakerMultiplier = tieBreakerMultiplier;
058: }
059:
060: /**
061: * Creates a new DisjunctionMaxQuery
062: * @param disjuncts a Collection<Query> of all the disjuncts to add
063: * @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct
064: */
065: public DisjunctionMaxQuery(Collection disjuncts,
066: float tieBreakerMultiplier) {
067: this .tieBreakerMultiplier = tieBreakerMultiplier;
068: add(disjuncts);
069: }
070:
071: /** Add a subquery to this disjunction
072: * @param query the disjunct added
073: */
074: public void add(Query query) {
075: disjuncts.add(query);
076: }
077:
078: /** Add a collection of disjuncts to this disjunction
079: * via Iterable<Query>
080: */
081: public void add(Collection disjuncts) {
082: this .disjuncts.addAll(disjuncts);
083: }
084:
085: /** An Iterator<Query> over the disjuncts */
086: public Iterator iterator() {
087: return disjuncts.iterator();
088: }
089:
090: /* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries */
091: private class DisjunctionMaxWeight implements Weight {
092:
093: private Similarity similarity; // The similarity which we are associated.
094: private ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts
095:
096: /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */
097: public DisjunctionMaxWeight(Searcher searcher)
098: throws IOException {
099: this .similarity = searcher.getSimilarity();
100: for (int i = 0; i < disjuncts.size(); i++)
101: weights.add(((Query) disjuncts.get(i))
102: .createWeight(searcher));
103: }
104:
105: /* Return our associated DisjunctionMaxQuery */
106: public Query getQuery() {
107: return DisjunctionMaxQuery.this ;
108: }
109:
110: /* Return our boost */
111: public float getValue() {
112: return getBoost();
113: }
114:
115: /* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */
116: public float sumOfSquaredWeights() throws IOException {
117: float max = 0.0f, sum = 0.0f;
118: for (int i = 0; i < weights.size(); i++) {
119: float sub = ((Weight) weights.get(i))
120: .sumOfSquaredWeights();
121: sum += sub;
122: max = Math.max(max, sub);
123: }
124: return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max)
125: * getBoost() * getBoost();
126: }
127:
128: /* Apply the computed normalization factor to our subqueries */
129: public void normalize(float norm) {
130: norm *= getBoost(); // Incorporate our boost
131: for (int i = 0; i < weights.size(); i++)
132: ((Weight) weights.get(i)).normalize(norm);
133: }
134:
135: /* Create the scorer used to score our associated DisjunctionMaxQuery */
136: public Scorer scorer(IndexReader reader) throws IOException {
137: DisjunctionMaxScorer result = new DisjunctionMaxScorer(
138: tieBreakerMultiplier, similarity);
139: for (int i = 0; i < weights.size(); i++) {
140: Weight w = (Weight) weights.get(i);
141: Scorer subScorer = w.scorer(reader);
142: if (subScorer == null)
143: return null;
144: result.add(subScorer);
145: }
146: return result;
147: }
148:
149: /* Explain the score we computed for doc */
150: public Explanation explain(IndexReader reader, int doc)
151: throws IOException {
152: if (disjuncts.size() == 1)
153: return ((Weight) weights.get(0)).explain(reader, doc);
154: ComplexExplanation result = new ComplexExplanation();
155: float max = 0.0f, sum = 0.0f;
156: result
157: .setDescription(tieBreakerMultiplier == 0.0f ? "max of:"
158: : "max plus " + tieBreakerMultiplier
159: + " times others of:");
160: for (int i = 0; i < weights.size(); i++) {
161: Explanation e = ((Weight) weights.get(i)).explain(
162: reader, doc);
163: if (e.isMatch()) {
164: result.setMatch(Boolean.TRUE);
165: result.addDetail(e);
166: sum += e.getValue();
167: max = Math.max(max, e.getValue());
168: }
169: }
170: result.setValue(max + (sum - max) * tieBreakerMultiplier);
171: return result;
172: }
173:
174: } // end of DisjunctionMaxWeight inner class
175:
176: /* Create the Weight used to score us */
177: protected Weight createWeight(Searcher searcher) throws IOException {
178: return new DisjunctionMaxWeight(searcher);
179: }
180:
181: /** Optimize our representation and our subqueries representations
182: * @param reader the IndexReader we query
183: * @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */
184: public Query rewrite(IndexReader reader) throws IOException {
185: if (disjuncts.size() == 1) {
186: Query singleton = (Query) disjuncts.get(0);
187: Query result = singleton.rewrite(reader);
188: if (getBoost() != 1.0f) {
189: if (result == singleton)
190: result = (Query) result.clone();
191: result.setBoost(getBoost() * result.getBoost());
192: }
193: return result;
194: }
195: DisjunctionMaxQuery clone = null;
196: for (int i = 0; i < disjuncts.size(); i++) {
197: Query clause = (Query) disjuncts.get(i);
198: Query rewrite = clause.rewrite(reader);
199: if (rewrite != clause) {
200: if (clone == null)
201: clone = (DisjunctionMaxQuery) this .clone();
202: clone.disjuncts.set(i, rewrite);
203: }
204: }
205: if (clone != null)
206: return clone;
207: else
208: return this ;
209: }
210:
211: /** Create a shallow copy of us -- used in rewriting if necessary
212: * @return a copy of us (but reuse, don't copy, our subqueries) */
213: public Object clone() {
214: DisjunctionMaxQuery clone = (DisjunctionMaxQuery) super .clone();
215: clone.disjuncts = (ArrayList) this .disjuncts.clone();
216: return clone;
217: }
218:
219: // inherit javadoc
220: public void extractTerms(Set terms) {
221: for (int i = 0; i < disjuncts.size(); i++) {
222: ((Query) disjuncts.get(i)).extractTerms(terms);
223: }
224: }
225:
226: /** Prettyprint us.
227: * @param field the field to which we are applied
228: * @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost"
229: */
230: public String toString(String field) {
231: StringBuffer buffer = new StringBuffer();
232: buffer.append("(");
233: for (int i = 0; i < disjuncts.size(); i++) {
234: Query subquery = (Query) disjuncts.get(i);
235: if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
236: buffer.append("(");
237: buffer.append(subquery.toString(field));
238: buffer.append(")");
239: } else
240: buffer.append(subquery.toString(field));
241: if (i != disjuncts.size() - 1)
242: buffer.append(" | ");
243: }
244: buffer.append(")");
245: if (tieBreakerMultiplier != 0.0f) {
246: buffer.append("~");
247: buffer.append(tieBreakerMultiplier);
248: }
249: if (getBoost() != 1.0) {
250: buffer.append("^");
251: buffer.append(getBoost());
252: }
253: return buffer.toString();
254: }
255:
256: /** Return true iff we represent the same query as o
257: * @param o another object
258: * @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us
259: */
260: public boolean equals(Object o) {
261: if (!(o instanceof DisjunctionMaxQuery))
262: return false;
263: DisjunctionMaxQuery other = (DisjunctionMaxQuery) o;
264: return this .getBoost() == other.getBoost()
265: && this .tieBreakerMultiplier == other.tieBreakerMultiplier
266: && this .disjuncts.equals(other.disjuncts);
267: }
268:
269: /** Compute a hash code for hashing us
270: * @return the hash code
271: */
272: public int hashCode() {
273: return Float.floatToIntBits(getBoost())
274: + Float.floatToIntBits(tieBreakerMultiplier)
275: + disjuncts.hashCode();
276: }
277:
278: }
|