001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.misc;
017:
018: import org.apache.lucene.search.Similarity;
019: import org.apache.lucene.search.DefaultSimilarity;
020:
021: import java.util.Map;
022: import java.util.HashMap;
023:
024: /**
025: * A similarity with a lengthNorm that provides for a "platuea" of
026: * equally good lengths, and tf helper functions.
027: *
028: * <p>
029: * For lengthNorm, A global min/max can be specified to define the
030: * platuea of lengths that should all have a norm of 1.0.
031: * Below the min, and above the max the lengthNorm drops off in a
032: * sqrt function.
033: * </p>
034: * <p>
035: * A per field min/max can be specified if different fields have
036: * different sweet spots.
037: * </p>
038: *
039: * <p>
040: * For tf, baselineTf and hyperbolicTf functions are provided, which
041: * subclasses can choose between.
042: * </p>
043: *
044: */
045: public class SweetSpotSimilarity extends DefaultSimilarity {
046:
047: private int ln_min = 1;
048: private int ln_max = 1;
049: private float ln_steep = 0.5f;
050:
051: private Map ln_mins = new HashMap(7);
052: private Map ln_maxs = new HashMap(7);
053: private Map ln_steeps = new HashMap(7);
054:
055: private float tf_base = 0.0f;
056: private float tf_min = 0.0f;
057:
058: private float tf_hyper_min = 0.0f;
059: private float tf_hyper_max = 2.0f;
060: private double tf_hyper_base = 1.3d;
061: private float tf_hyper_xoffset = 10.0f;
062:
063: public SweetSpotSimilarity() {
064: super ();
065: }
066:
067: /**
068: * Sets the baseline and minimum function variables for baselineTf
069: *
070: * @see #baselineTf
071: */
072: public void setBaselineTfFactors(float base, float min) {
073: tf_min = min;
074: tf_base = base;
075: }
076:
077: /**
078: * Sets the function variables for the hyperbolicTf functions
079: *
080: * @param min the minimum tf value to ever be returned (default: 0.0)
081: * @param max the maximum tf value to ever be returned (default: 2.0)
082: * @param base the base value to be used in the exponential for the hyperbolic function (default: e)
083: * @param xoffset the midpoint of the hyperbolic function (default: 10.0)
084: * @see #hyperbolicTf
085: */
086: public void setHyperbolicTfFactors(float min, float max,
087: double base, float xoffset) {
088: tf_hyper_min = min;
089: tf_hyper_max = max;
090: tf_hyper_base = base;
091: tf_hyper_xoffset = xoffset;
092: }
093:
094: /**
095: * Sets the default function variables used by lengthNorm when no field
096: * specifc variables have been set.
097: *
098: * @see #lengthNorm
099: */
100: public void setLengthNormFactors(int min, int max, float steepness) {
101: this .ln_min = min;
102: this .ln_max = max;
103: this .ln_steep = steepness;
104: }
105:
106: /**
107: * Sets the function variables used by lengthNorm for a specific named field
108: *
109: * @see #lengthNorm
110: */
111: public void setLengthNormFactors(String field, int min, int max,
112: float steepness) {
113: ln_mins.put(field, new Integer(min));
114: ln_maxs.put(field, new Integer(max));
115: ln_steeps.put(field, new Float(steepness));
116: }
117:
118: /**
119: * Implemented as:
120: * <code>
121: * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
122: * </code>.
123: *
124: * <p>
125: * This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
126: * steepness is 0.5
127: * </p>
128: *
129: * <p>
130: * :TODO: potential optimiation is to just flat out return 1.0f if numTerms
131: * is between min and max.
132: * </p>
133: *
134: * @see #setLengthNormFactors
135: */
136: public float lengthNorm(String fieldName, int numTerms) {
137: int l = ln_min;
138: int h = ln_max;
139: float s = ln_steep;
140:
141: if (ln_mins.containsKey(fieldName)) {
142: l = ((Number) ln_mins.get(fieldName)).intValue();
143: }
144: if (ln_maxs.containsKey(fieldName)) {
145: h = ((Number) ln_maxs.get(fieldName)).intValue();
146: }
147: if (ln_steeps.containsKey(fieldName)) {
148: s = ((Number) ln_steeps.get(fieldName)).floatValue();
149: }
150:
151: return (float) (1.0f / Math.sqrt((s * (float) (Math
152: .abs(numTerms - l)
153: + Math.abs(numTerms - h) - (h - l))) + 1.0f));
154: }
155:
156: /**
157: * Delegates to baselineTf
158: *
159: * @see #baselineTf
160: */
161: public float tf(int freq) {
162: return baselineTf(freq);
163: }
164:
165: /**
166: * Implemented as:
167: * <code>
168: * (x <= min) ? base : sqrt(x+(base**2)-min)
169: * </code>
170: * ...but with a special case check for 0.
171: * <p>
172: * This degrates to <code>sqrt(x)</code> when min and base are both 0
173: * </p>
174: *
175: * @see #setBaselineTfFactors
176: */
177: public float baselineTf(float freq) {
178:
179: if (0.0f == freq)
180: return 0.0f;
181:
182: return (freq <= tf_min) ? tf_base : (float) Math.sqrt(freq
183: + (tf_base * tf_base) - tf_min);
184: }
185:
186: /**
187: * Uses a hyperbolic tangent function that allows for a hard max...
188: *
189: * <code>
190: * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
191: * </code>
192: *
193: * <p>
194: * This code is provided as a convincience for subclasses that want
195: * to use a hyperbolic tf function.
196: * </p>
197: *
198: * @see #setHyperbolicTfFactors
199: */
200: public float hyperbolicTf(float freq) {
201: if (0.0f == freq)
202: return 0.0f;
203:
204: final float min = tf_hyper_min;
205: final float max = tf_hyper_max;
206: final double base = tf_hyper_base;
207: final float xoffset = tf_hyper_xoffset;
208: final double x = (double) (freq - xoffset);
209:
210: final float result = min
211: + (float) ((max - min) / 2.0f * (((Math.pow(base, x) - Math
212: .pow(base, -x)) / (Math.pow(base, x) + Math
213: .pow(base, -x))) + 1.0d));
214:
215: return Float.isNaN(result) ? max : result;
216:
217: }
218:
219: }
|