001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.index.Term;
021: import org.apache.lucene.util.SmallFloat;
022:
023: import java.io.IOException;
024: import java.io.Serializable;
025: import java.util.Collection;
026: import java.util.Iterator;
027:
028: /** Expert: Scoring API.
029: * <p>Subclasses implement search scoring.
030: *
031: * <p>The score of query <code>q</code> for document <code>d</code> correlates to the
032: * cosine-distance or dot-product between document and query vectors in a
033: * <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
034: * Vector Space Model (VSM) of Information Retrieval</a>.
035: * A document whose vector is closer to the query vector in that model is scored higher.
036: *
037: * The score is computed as follows:
038: *
039: * <P>
040: * <table cellpadding="1" cellspacing="0" border="1" align="center">
041: * <tr><td>
042: * <table cellpadding="1" cellspacing="0" border="0" align="center">
043: * <tr>
044: * <td valign="middle" align="right" rowspan="1">
045: * score(q,d) =
046: * <A HREF="#formula_coord">coord(q,d)</A> ·
047: * <A HREF="#formula_queryNorm">queryNorm(q)</A> ·
048: * </td>
049: * <td valign="bottom" align="center" rowspan="1">
050: * <big><big><big>∑</big></big></big>
051: * </td>
052: * <td valign="middle" align="right" rowspan="1">
053: * <big><big>(</big></big>
054: * <A HREF="#formula_tf">tf(t in d)</A> ·
055: * <A HREF="#formula_idf">idf(t)</A><sup>2</sup> ·
056: * <A HREF="#formula_termBoost">t.getBoost()</A> ·
057: * <A HREF="#formula_norm">norm(t,d)</A>
058: * <big><big>)</big></big>
059: * </td>
060: * </tr>
 * <tr valign="top">
062: * <td></td>
063: * <td align="center"><small>t in q</small></td>
064: * <td></td>
065: * </tr>
066: * </table>
067: * </td></tr>
068: * </table>
069: *
070: * <p> where
071: * <ol>
072: * <li>
073: * <A NAME="formula_tf"></A>
074: * <b>tf(t in d)</b>
075: * correlates to the term's <i>frequency</i>,
076: * defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
077: * Documents that have more occurrences of a given term receive a higher score.
078: * The default computation for <i>tf(t in d)</i> in
079: * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
080: *
081: * <br> <br>
082: * <table cellpadding="2" cellspacing="2" border="0" align="center">
083: * <tr>
084: * <td valign="middle" align="right" rowspan="1">
085: * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} =
086: * </td>
087: * <td valign="top" align="center" rowspan="1">
088: * frequency<sup><big>½</big></sup>
089: * </td>
090: * </tr>
091: * </table>
092: * <br> <br>
093: * </li>
094: *
095: * <li>
096: * <A NAME="formula_idf"></A>
097: * <b>idf(t)</b> stands for Inverse Document Frequency. This value
098: * correlates to the inverse of <i>docFreq</i>
099: * (the number of documents in which the term <i>t</i> appears).
100: * This means rarer terms give higher contribution to the total score.
101: * The default computation for <i>idf(t)</i> in
102: * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
103: *
104: * <br> <br>
105: * <table cellpadding="2" cellspacing="2" border="0" align="center">
106: * <tr>
107: * <td valign="middle" align="right">
108: * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} =
109: * </td>
110: * <td valign="middle" align="center">
111: * 1 + log <big>(</big>
112: * </td>
113: * <td valign="middle" align="center">
114: * <table>
115: * <tr><td align="center"><small>numDocs</small></td></tr>
116: * <tr><td align="center">–––––––––</td></tr>
117: * <tr><td align="center"><small>docFreq+1</small></td></tr>
118: * </table>
119: * </td>
120: * <td valign="middle" align="center">
121: * <big>)</big>
122: * </td>
123: * </tr>
124: * </table>
125: * <br> <br>
126: * </li>
127: *
128: * <li>
129: * <A NAME="formula_coord"></A>
130: * <b>coord(q,d)</b>
131: * is a score factor based on how many of the query terms are found in the specified document.
132: * Typically, a document that contains more of the query's terms will receive a higher score
133: * than another document with fewer query terms.
134: * This is a search time factor computed in
135: * {@link #coord(int, int) coord(q,d)}
136: * by the Similarity in effect at search time.
137: * <br> <br>
138: * </li>
139: *
140: * <li><b>
141: * <A NAME="formula_queryNorm"></A>
142: * queryNorm(q)
143: * </b>
144: * is a normalizing factor used to make scores between queries comparable.
145: * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
146: * but rather just attempts to make scores from different queries (or even different indexes) comparable.
147: * This is a search time factor computed by the Similarity in effect at search time.
148: *
149: * The default computation in
150: * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity}
151: * is:
152: * <br> <br>
153: * <table cellpadding="1" cellspacing="0" border="0" align="center">
154: * <tr>
155: * <td valign="middle" align="right" rowspan="1">
156: * queryNorm(q) =
157: * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}
158: * =
159: * </td>
160: * <td valign="middle" align="center" rowspan="1">
161: * <table>
162: * <tr><td align="center"><big>1</big></td></tr>
163: * <tr><td align="center"><big>
164: * ––––––––––––––
165: * </big></td></tr>
166: * <tr><td align="center">sumOfSquaredWeights<sup><big>½</big></sup></td></tr>
167: * </table>
168: * </td>
169: * </tr>
170: * </table>
171: * <br> <br>
172: *
173: * The sum of squared weights (of the query terms) is
174: * computed by the query {@link org.apache.lucene.search.Weight} object.
175: * For example, a {@link org.apache.lucene.search.BooleanQuery boolean query}
176: * computes this value as:
177: *
178: * <br> <br>
 * <table cellpadding="1" cellspacing="0" border="0" align="center">
180: * <tr>
181: * <td valign="middle" align="right" rowspan="1">
182: * {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} =
183: * {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
184: * ·
185: * </td>
186: * <td valign="bottom" align="center" rowspan="1">
187: * <big><big><big>∑</big></big></big>
188: * </td>
189: * <td valign="middle" align="right" rowspan="1">
190: * <big><big>(</big></big>
191: * <A HREF="#formula_idf">idf(t)</A> ·
192: * <A HREF="#formula_termBoost">t.getBoost()</A>
193: * <big><big>) <sup>2</sup> </big></big>
194: * </td>
195: * </tr>
 * <tr valign="top">
197: * <td></td>
198: * <td align="center"><small>t in q</small></td>
199: * <td></td>
200: * </tr>
201: * </table>
202: * <br> <br>
203: *
204: * </li>
205: *
206: * <li>
207: * <A NAME="formula_termBoost"></A>
208: * <b>t.getBoost()</b>
209: * is a search time boost of term <i>t</i> in the query <i>q</i> as
210: * specified in the query text
211: * (see <A HREF="../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
212: * or as set by application calls to
213: * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
214: * Notice that there is really no direct API for accessing a boost of one term in a multi term query,
215: * but rather multi terms are represented in a query as multi
216: * {@link org.apache.lucene.search.TermQuery TermQuery} objects,
217: * and so the boost of a term in the query is accessible by calling the sub-query
218: * {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
219: * <br> <br>
220: * </li>
221: *
222: * <li>
223: * <A NAME="formula_norm"></A>
224: * <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
225: *
226: * <ul>
227: * <li><b>Document boost</b> - set by calling
228: * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
229: * before adding the document to the index.
230: * </li>
231: * <li><b>Field boost</b> - set by calling
232: * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
233: * before adding the field to a document.
234: * </li>
235: * <li>{@link #lengthNorm(String, int) <b>lengthNorm</b>(field)} - computed
236: * when the document is added to the index in accordance with the number of tokens
237: * of this field in the document, so that shorter fields contribute more to the score.
238: * LengthNorm is computed by the Similarity class in effect at indexing.
239: * </li>
240: * </ul>
241: *
242: * <p>
243: * When a document is added to the index, all the above factors are multiplied.
244: * If the document has multiple fields with the same name, all their boosts are multiplied together:
245: *
246: * <br> <br>
 * <table cellpadding="1" cellspacing="0" border="0" align="center">
248: * <tr>
249: * <td valign="middle" align="right" rowspan="1">
250: * norm(t,d) =
251: * {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
252: * ·
253: * {@link #lengthNorm(String, int) lengthNorm(field)}
254: * ·
255: * </td>
256: * <td valign="bottom" align="center" rowspan="1">
257: * <big><big><big>∏</big></big></big>
258: * </td>
259: * <td valign="middle" align="right" rowspan="1">
260: * {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
261: * </td>
262: * </tr>
 * <tr valign="top">
264: * <td></td>
265: * <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
266: * <td></td>
267: * </tr>
268: * </table>
269: * <br> <br>
270: * However the resulted <i>norm</i> value is {@link #encodeNorm(float) encoded} as a single byte
271: * before being stored.
272: * At search time, the norm byte value is read from the index
273: * {@link org.apache.lucene.store.Directory directory} and
274: * {@link #decodeNorm(byte) decoded} back to a float <i>norm</i> value.
275: * This encoding/decoding, while reducing index size, comes with the price of
276: * precision loss - it is not guaranteed that decode(encode(x)) = x.
277: * For instance, decode(encode(0.89)) = 0.75.
278: * Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
279: * using a different {@link Similarity} for search.
280: * <br> <br>
281: * </li>
282: * </ol>
283: *
284: * @see #setDefault(Similarity)
285: * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
286: * @see Searcher#setSimilarity(Similarity)
287: */
288: public abstract class Similarity implements Serializable {
289: /** The Similarity implementation used by default. */
290: private static Similarity defaultImpl = new DefaultSimilarity();
291:
292: /** Set the default Similarity implementation used by indexing and search
293: * code.
294: *
295: * @see Searcher#setSimilarity(Similarity)
296: * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
297: */
298: public static void setDefault(Similarity similarity) {
299: Similarity.defaultImpl = similarity;
300: }
301:
302: /** Return the default Similarity implementation used by indexing and search
303: * code.
304: *
305: * <p>This is initially an instance of {@link DefaultSimilarity}.
306: *
307: * @see Searcher#setSimilarity(Similarity)
308: * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)
309: */
310: public static Similarity getDefault() {
311: return Similarity.defaultImpl;
312: }
313:
314: /** Cache of decoded bytes. */
315: private static final float[] NORM_TABLE = new float[256];
316:
317: static {
318: for (int i = 0; i < 256; i++)
319: NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte) i);
320: }
321:
322: /** Decodes a normalization factor stored in an index.
323: * @see #encodeNorm(float)
324: */
325: public static float decodeNorm(byte b) {
326: return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
327: }
328:
329: /** Returns a table for decoding normalization bytes.
330: * @see #encodeNorm(float)
331: */
332: public static float[] getNormDecoder() {
333: return NORM_TABLE;
334: }
335:
336: /** Computes the normalization value for a field given the total number of
337: * terms contained in a field. These values, together with field boosts, are
338: * stored in an index and multipled into scores for hits on each field by the
339: * search code.
340: *
341: * <p>Matches in longer fields are less precise, so implementations of this
342: * method usually return smaller values when <code>numTokens</code> is large,
343: * and larger values when <code>numTokens</code> is small.
344: *
345: * <p>That these values are computed under
346: * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
347: * and stored then using
348: * {@link #encodeNorm(float)}.
349: * Thus they have limited precision, and documents
350: * must be re-indexed if this method is altered.
351: *
352: * @param fieldName the name of the field
353: * @param numTokens the total number of tokens contained in fields named
354: * <i>fieldName</i> of <i>doc</i>.
355: * @return a normalization factor for hits on this field of this document
356: *
357: * @see org.apache.lucene.document.Field#setBoost(float)
358: */
359: public abstract float lengthNorm(String fieldName, int numTokens);
360:
361: /** Computes the normalization value for a query given the sum of the squared
362: * weights of each of the query terms. This value is then multipled into the
363: * weight of each query term.
364: *
365: * <p>This does not affect ranking, but rather just attempts to make scores
366: * from different queries comparable.
367: *
368: * @param sumOfSquaredWeights the sum of the squares of query term weights
369: * @return a normalization factor for query weights
370: */
371: public abstract float queryNorm(float sumOfSquaredWeights);
372:
373: /** Encodes a normalization factor for storage in an index.
374: *
375: * <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
376: * the zero-exponent point at 15, thus
377: * representing values from around 7x10^9 to 2x10^-9 with about one
378: * significant decimal digit of accuracy. Zero is also represented.
379: * Negative numbers are rounded up to zero. Values too large to represent
380: * are rounded down to the largest representable value. Positive values too
381: * small to represent are rounded up to the smallest positive representable
382: * value.
383: *
384: * @see org.apache.lucene.document.Field#setBoost(float)
385: * @see org.apache.lucene.util.SmallFloat
386: */
387: public static byte encodeNorm(float f) {
388: return SmallFloat.floatToByte315(f);
389: }
390:
391: /** Computes a score factor based on a term or phrase's frequency in a
392: * document. This value is multiplied by the {@link #idf(Term, Searcher)}
393: * factor for each term in the query and these products are then summed to
394: * form the initial score for a document.
395: *
396: * <p>Terms and phrases repeated in a document indicate the topic of the
397: * document, so implementations of this method usually return larger values
398: * when <code>freq</code> is large, and smaller values when <code>freq</code>
399: * is small.
400: *
401: * <p>The default implementation calls {@link #tf(float)}.
402: *
403: * @param freq the frequency of a term within a document
404: * @return a score factor based on a term's within-document frequency
405: */
406: public float tf(int freq) {
407: return tf((float) freq);
408: }
409:
410: /** Computes the amount of a sloppy phrase match, based on an edit distance.
411: * This value is summed for each sloppy phrase match in a document to form
412: * the frequency that is passed to {@link #tf(float)}.
413: *
414: * <p>A phrase match with a small edit distance to a document passage more
415: * closely matches the document, so implementations of this method usually
416: * return larger values when the edit distance is small and smaller values
417: * when it is large.
418: *
419: * @see PhraseQuery#setSlop(int)
420: * @param distance the edit distance of this sloppy phrase match
421: * @return the frequency increment for this match
422: */
423: public abstract float sloppyFreq(int distance);
424:
425: /** Computes a score factor based on a term or phrase's frequency in a
426: * document. This value is multiplied by the {@link #idf(Term, Searcher)}
427: * factor for each term in the query and these products are then summed to
428: * form the initial score for a document.
429: *
430: * <p>Terms and phrases repeated in a document indicate the topic of the
431: * document, so implementations of this method usually return larger values
432: * when <code>freq</code> is large, and smaller values when <code>freq</code>
433: * is small.
434: *
435: * @param freq the frequency of a term within a document
436: * @return a score factor based on a term's within-document frequency
437: */
438: public abstract float tf(float freq);
439:
440: /** Computes a score factor for a simple term.
441: *
442: * <p>The default implementation is:<pre>
443: * return idf(searcher.docFreq(term), searcher.maxDoc());
444: * </pre>
445: *
446: * Note that {@link Searcher#maxDoc()} is used instead of
447: * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
448: * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
449: * so is the other, and in the same direction.
450: *
451: * @param term the term in question
452: * @param searcher the document collection being searched
453: * @return a score factor for the term
454: */
455: public float idf(Term term, Searcher searcher) throws IOException {
456: return idf(searcher.docFreq(term), searcher.maxDoc());
457: }
458:
459: /** Computes a score factor for a phrase.
460: *
461: * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
462: * for each term in the phrase.
463: *
464: * @param terms the terms in the phrase
465: * @param searcher the document collection being searched
466: * @return a score factor for the phrase
467: */
468: public float idf(Collection terms, Searcher searcher)
469: throws IOException {
470: float idf = 0.0f;
471: Iterator i = terms.iterator();
472: while (i.hasNext()) {
473: idf += idf((Term) i.next(), searcher);
474: }
475: return idf;
476: }
477:
478: /** Computes a score factor based on a term's document frequency (the number
479: * of documents which contain the term). This value is multiplied by the
480: * {@link #tf(int)} factor for each term in the query and these products are
481: * then summed to form the initial score for a document.
482: *
483: * <p>Terms that occur in fewer documents are better indicators of topic, so
484: * implementations of this method usually return larger values for rare terms,
485: * and smaller values for common terms.
486: *
487: * @param docFreq the number of documents which contain the term
488: * @param numDocs the total number of documents in the collection
489: * @return a score factor based on the term's document frequency
490: */
491: public abstract float idf(int docFreq, int numDocs);
492:
493: /** Computes a score factor based on the fraction of all query terms that a
494: * document contains. This value is multiplied into scores.
495: *
496: * <p>The presence of a large portion of the query terms indicates a better
497: * match with the query, so implementations of this method usually return
498: * larger values when the ratio between these parameters is large and smaller
499: * values when the ratio between them is small.
500: *
501: * @param overlap the number of query terms matched in the document
502: * @param maxOverlap the total number of terms in the query
503: * @return a score factor based on term overlap with the query
504: */
505: public abstract float coord(int overlap, int maxOverlap);
506:
507: /**
508: * Calculate a scoring factor based on the data in the payload. Overriding implementations
509: * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
510: * what is in the byte array.
511: * <p>
512: * The default implementation returns 1.
513: *
514: * @param fieldName The fieldName of the term this payload belongs to
515: * @param payload The payload byte array to be scored
516: * @param offset The offset into the payload array
517: * @param length The length in the array
518: * @return An implementation dependent float to be used as a scoring factor
519: */
520: public float scorePayload(String fieldName, byte[] payload,
521: int offset, int length) {
522: //Do nothing
523: return 1;
524: }
525: }
|