001: package it.unimi.dsi.mg4j.search.score;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.ints.IntList;
025: import it.unimi.dsi.mg4j.index.Index;
026: import it.unimi.dsi.mg4j.search.DocumentIterator;
027: import it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor;
028: import it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor;
029: import it.unimi.dsi.mg4j.search.visitor.TermCollectionVisitor;
030:
031: import java.io.IOException;
032: import java.util.Arrays;
033:
034: import org.apache.log4j.Logger;
035:
036: /** A scorer that implements the BM25 ranking formula.
037: *
038: * <p><strong>Warning</strong>: the default values {@link #DEFAULT_K1} and {@link #DEFAULT_B}
039: * have changed in MG4J 1.1.2 (see below).
040: *
041: * <p>BM25 is the name of a formula derived from the probabilistic model. The essential feature
042: * of the formula is that of assigning to each term appearing in a given document a weight depending
043: * both on the count (the number of occurrences of the term in the document), on the frequency (the
044: * number of the documents in which the term appears) and on the document length (in words).
045: *
046: * <p>There are a number
047: * of incarnations with small variations of the formula itself. Here, the weight
048: * assigned to a term which appears in <var>f</var> documents out of a collection of <var>N</var> documents
049: * w.r.t. to a document of length <var>l</var> in which the term appears <var>c</var> times is
050: * <div style="text-align: center">
051: * log<big>(</big> (<var>N</var> − <var>f</var> + 1/2) / (f + 1/2) <big>)</big> ( <var>k</var><sub>1</sub> + 1 ) <var>c</var> <big>⁄</big> <big>(</big> <var>c</var> + <var>k</var><sub>1</sub> ((1 − <var>b</var>) + <var>b</var><var>l</var> / <var>L</var>) <big>)</big>,
052: * </div>
053: * where <var>L</var> is the average document length, and <var>k</var><sub>1</sub> and <var>b</var> are
054: * parameters that default to {@link #DEFAULT_K1} and {@link #DEFAULT_B}: these values were chosen
055: * following the suggestions given in
056: * “Efficiency vs. effectiveness in Terabyte-scale information retrieval”, by Stefan Büttcher and Charles L. A. Clarke,
057: * in <i>Proceedings of the 14th Text REtrieval
058: * Conference (TREC 2005)</i>. Gaithersburg, USA, November 2005. The logarithmic part (a.k.a.
059: * <em>idf (inverse document-frequency)</em> part) is actually
060: * maximised with {@link #EPSILON_SCORE}, so it is never negative (the net effect being that terms appearing
061: * in more than half of the documents have almost no weight).
062: *
063: * <p>This class uses a {@link it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor}
064: * and related classes to take into consideration only terms that are actually involved
065: * in the current document.
066: *
067: * @author Mauro Mereu
068: * @author Sebastiano Vigna
069: */
070: public class BM25Scorer extends AbstractWeightedScorer implements
071: DelegatingScorer {
072: private static final Logger LOGGER = Logger
073: .getLogger(BM25Scorer.class);
074: private static final boolean DEBUG = false;
075:
076: /** The default value used for the parameter <var>k</var><sub>1</sub>. */
077: public final static double DEFAULT_K1 = 1.2;
078: /** The default value used for the parameter <var>b</var>. */
079: public final static double DEFAULT_B = 0.5;
080: /** The value of the document-frequency part for terms appearing in more than half of the documents. */
081: public final static double EPSILON_SCORE = 1.000000082240371E-9; // TODO: put a better value
082:
083: /** The counter collection visitor used to estimate counts. */
084: private final CounterCollectionVisitor counterCollectionVisitor;
085: /** The counter setup visitor used to estimate counts. */
086: private final CounterSetupVisitor setupVisitor;
087: /** The term collection visitor used to estimate counts. */
088: private final TermCollectionVisitor termVisitor;
089:
090: /** The parameter <var>k</var><sub>1</sub>. */
091: public final double k1;
092: /** The parameter <var>b</var>. */
093: public final double b;
094:
095: /** The parameter {@link #k1} plus one, precomputed. */
096: private final double k1Plus1;
097: /** One minus {@link #b}, precomputed. */
098: private final double oneMinusB;
099:
100: /** An array (parallel to {@link #currIndex}) that caches average document sizes. */
101: private double averageDocumentSize[];
102: /** An array (parallel to {@link #currIndex}) that caches size lists. */
103: private IntList sizes[];
104: /** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */
105: private int[] size;
106: /** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. */
107: private double[] weightedIdfPart;
108:
109: public BM25Scorer() {
110: this (DEFAULT_K1, DEFAULT_B);
111: }
112:
113: public BM25Scorer(final double k1, final double b) {
114: termVisitor = new TermCollectionVisitor();
115: setupVisitor = new CounterSetupVisitor(termVisitor);
116: counterCollectionVisitor = new CounterCollectionVisitor(
117: setupVisitor);
118: this .k1 = k1;
119: this .b = b;
120: k1Plus1 = k1 + 1;
121: oneMinusB = 1 - b;
122: }
123:
124: public BM25Scorer(final String k1, final String b) {
125: this (Double.parseDouble(k1), Double.parseDouble(b));
126: }
127:
128: public synchronized BM25Scorer copy() {
129: final BM25Scorer scorer = new BM25Scorer(k1, b);
130: scorer.setWeights(index2Weight);
131: return scorer;
132: }
133:
134: public double score() throws IOException {
135: setupVisitor.clear();
136: documentIterator.acceptOnTruePaths(counterCollectionVisitor);
137:
138: final int document = documentIterator.document();
139: final int[] count = setupVisitor.count;
140: final int[] indexNumber = setupVisitor.indexNumber;
141: final double[] weightedIdfPart = this .weightedIdfPart;
142: final double[] averageDocumentSize = this .averageDocumentSize;
143: final int[] size = this .size;
144:
145: for (int i = currIndex.length; i-- != 0;)
146: size[i] = sizes[i].getInt(document);
147:
148: int k;
149: double score = 0;
150: for (int i = count.length; i-- != 0;) {
151: k = indexNumber[i];
152: score += (k1Plus1 * count[i])
153: / (count[i] + k1
154: * (oneMinusB + b * size[k]
155: / averageDocumentSize[k]))
156: * weightedIdfPart[i];
157: }
158: return score;
159: }
160:
161: public double score(final Index index) {
162: throw new UnsupportedOperationException();
163: }
164:
165: public void wrap(DocumentIterator d) throws IOException {
166: documentIterator = d;
167: termVisitor.prepare();
168: d.accept(termVisitor);
169:
170: if (DEBUG)
171: LOGGER.debug("Term Visitor found "
172: + termVisitor.numberOfPairs() + " leaves");
173:
174: // Note that we use the index array provided by the visitor, *not* by the iterator.
175: final Index[] index = termVisitor.indices();
176:
177: if (DEBUG)
178: LOGGER.debug("Indices: " + Arrays.toString(index));
179:
180: // Some caching of frequently-used values
181: sizes = new IntList[index.length];
182: for (int i = index.length; i-- != 0;)
183: if ((sizes[i] = index[i].sizes) == null)
184: throw new IllegalStateException(
185: "A BM25 scorer requires document sizes");
186:
187: averageDocumentSize = new double[index.length];
188: for (int i = index.length; i-- != 0;)
189: averageDocumentSize[i] = (double) (index[i].numberOfOccurrences)
190: / index[i].numberOfDocuments;
191:
192: if (DEBUG)
193: LOGGER.debug("Average document sizes: "
194: + Arrays.toString(averageDocumentSize));
195: setupVisitor.prepare();
196: d.accept(setupVisitor);
197:
198: final int[] frequency = setupVisitor.frequency;
199: final int[] indexNumber = setupVisitor.indexNumber;
200:
201: // We do all logs here, and multiply by the weight
202: weightedIdfPart = new double[frequency.length];
203: for (int i = weightedIdfPart.length; i-- != 0;)
204: weightedIdfPart[i] = Math.max(EPSILON_SCORE, Math
205: .log((index[indexNumber[i]].numberOfDocuments
206: - frequency[i] + 0.5)
207: / (frequency[i] + 0.5)))
208: * index2Weight.getDouble(index[indexNumber[i]]);
209:
210: size = new int[index.length];
211: currIndex = index;
212: }
213:
214: public boolean usesIntervals() {
215: return false;
216: }
217:
218: }
|