FuzzyTermEnum.cs :  » Search-Engines » dotLucene » Lucene » Net » Search » C# / CSharp Open Source

Home
C# / CSharp Open Source
1.2.6.4 mono .net core
2.2.6.4 mono core
3.Aspect Oriented Frameworks
4.Bloggers
5.Build Systems
6.Business Application
7.Charting Reporting Tools
8.Chat Servers
9.Code Coverage Tools
10.Content Management Systems CMS
11.CRM ERP
12.Database
13.Development
14.Email
15.Forum
16.Game
17.GIS
18.GUI
19.IDEs
20.Installers Generators
21.Inversion of Control Dependency Injection
22.Issue Tracking
23.Logging Tools
24.Message
25.Mobile
26.Network Clients
27.Network Servers
28.Office
29.PDF
30.Persistence Frameworks
31.Portals
32.Profilers
33.Project Management
34.RSS RDF
35.Rule Engines
36.Script
37.Search Engines
38.Sound Audio
39.Source Control
40.SQL Clients
41.Template Engines
42.Testing
43.UML
44.Web Frameworks
45.Web Service
46.Web Testing
47.Wiki Engines
48.Windows Presentation Foundation
49.Workflows
50.XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » Search Engines » dotLucene 
dotLucene » Lucene » Net » Search » FuzzyTermEnum.cs
/*
 * Copyright 2004 The Apache Software Foundation
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using IndexReaderLucene.Net.Index.IndexReader;
using TermLucene.Net.Index.Term;

namespace Lucene.Net.Search{
  
  /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar
  /// to the specified filter term.
  /// 
  /// <p>Term enumerations are always ordered by Term.compareTo().  Each term in
  /// the enumeration is greater than all that precede it.
  /// </summary>
  public sealed class FuzzyTermEnum : FilteredTermEnum
  {
    
    /* This should be somewhere around the average long word.
    * If it is longer, we waste time and space. If it is shorter, we waste a
    * little bit of time growing the array as we encounter longer words.
    */
    private const int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
    
    /* Allows us save time required to create a new array
    * everytime similarity is called.
    */
    private int[][] d;
    
    private float similarity;
    private bool endEnum = false;
    
    private Term searchTerm = null;
    private System.String field;
    private System.String text;
    private System.String prefix;
    
    private float minimumSimilarity;
    private float scale_factor;
    private int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
    
    /// <summary> Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
    /// <p>
    /// After calling the constructor the enumeration is already pointing to the first 
    /// valid term if such a term exists. 
    /// 
    /// </summary>
    /// <param name="reader">
    /// </param>
    /// <param name="term">
    /// </param>
    /// <throws>  IOException </throws>
    /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
    /// </seealso>
    public FuzzyTermEnum(IndexReader reader, Term term) : this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength)
    {
    }
    
    /// <summary> Creates a FuzzyTermEnum with an empty prefix.
    /// <p>
    /// After calling the constructor the enumeration is already pointing to the first 
    /// valid term if such a term exists. 
    /// 
    /// </summary>
    /// <param name="reader">
    /// </param>
    /// <param name="term">
    /// </param>
    /// <param name="minSimilarity">
    /// </param>
    /// <throws>  IOException </throws>
    /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
    /// </seealso>
    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) : this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength)
    {
    }
    
    /// <summary> Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
    /// length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
    /// <code>minSimilarity</code>.
    /// <p>
    /// After calling the constructor the enumeration is already pointing to the first 
    /// valid term if such a term exists. 
    /// 
    /// </summary>
    /// <param name="reader">Delivers terms.
    /// </param>
    /// <param name="term">Pattern term.
    /// </param>
    /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
    /// </param>
    /// <param name="prefixLength">Length of required common prefix. Default value is 0.
    /// </param>
    /// <throws>  IOException </throws>
    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) : base()
    {
      
      if (minSimilarity >= 1.0f)
        throw new System.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
      else if (minSimilarity < 0.0f)
        throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
      if (prefixLength < 0)
        throw new System.ArgumentException("prefixLength cannot be less than 0");
      
      this.minimumSimilarity = minSimilarity;
      this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
      this.searchTerm = term;
      this.field = searchTerm.Field();
      
      //The prefix could be longer than the word.
      //It's kind of silly though.  It means we must match the entire word.
      int fullSearchTermLength = searchTerm.Text().Length;
      int realPrefixLength = prefixLength > fullSearchTermLength?fullSearchTermLength:prefixLength;
      
      this.text = searchTerm.Text().Substring(realPrefixLength);
      this.prefix = searchTerm.Text().Substring(0, (realPrefixLength) - (0));
      
      InitializeMaxDistances();
      this.d = InitDistanceArray();
      
      SetEnum(reader.Terms(new Term(searchTerm.Field(), prefix)));
    }
    
    /// <summary> The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
    /// calculate the distance between the given term and the comparing term. 
    /// </summary>
    protected internal override bool TermCompare(Term term)
    {
      if (field == term.Field() && term.Text().StartsWith(prefix))
      {
        System.String target = term.Text().Substring(prefix.Length);
        this.similarity = Similarity(target);
        return (similarity > minimumSimilarity);
      }
      endEnum = true;
      return false;
    }
    
    public override float Difference()
    {
      return (float) ((similarity - minimumSimilarity) * scale_factor);
    }
    
    public override bool EndEnum()
    {
      return endEnum;
    }
    
    /// <summary>***************************
    /// Compute Levenshtein distance
    /// ****************************
    /// </summary>
    
    /// <summary> Finds and returns the smallest of three integers </summary>
    private static int min(int a, int b, int c)
    {
      int t = (a < b) ? a : b;
      return (t < c) ? t : c;
    }
    
    private int[][] InitDistanceArray()
    {
      int[][] tmpArray = new int[this.text.Length + 1][];
      for (int i = 0; i < this.text.Length + 1; i++)
      {
        tmpArray[i] = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
      }
      return tmpArray;
    }
    
    /// <summary> <p>Similarity returns a number that is 1.0f or less (including negative numbers)
    /// based on how similar the Term is compared to a target term.  It returns
    /// exactly 0.0f when
    /// <pre>
    /// editDistance &lt; maximumEditDistance</pre>
    /// Otherwise it returns:
    /// <pre>
    /// 1 - (editDistance / length)</pre>
    /// where length is the length of the shortest term (text or target) including a
    /// prefix that are identical and editDistance is the Levenshtein distance for
    /// the two words.</p>
    /// 
    /// <p>Embedded within this algorithm is a fail-fast Levenshtein distance
    /// algorithm.  The fail-fast algorithm differs from the standard Levenshtein
    /// distance algorithm in that it is aborted if it is discovered that the
    /// mimimum distance between the words is greater than some threshold.
    /// 
    /// <p>To calculate the maximum distance threshold we use the following formula:
    /// <pre>
    /// (1 - minimumSimilarity) * length</pre>
    /// where length is the shortest term including any prefix that is not part of the
    /// similarity comparision.  This formula was derived by solving for what maximum value
    /// of distance returns false for the following statements:
    /// <pre>
    /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
    /// return (similarity > minimumSimilarity);</pre>
    /// where distance is the Levenshtein distance for the two words.
    /// </p>
    /// <p>Levenshtein distance (also known as edit distance) is a measure of similiarity
    /// between two strings where the distance is measured as the number of character
    /// deletions, insertions or substitutions required to transform one string to
    /// the other string.
    /// </summary>
    /// <param name="target">the target word or phrase
    /// </param>
    /// <returns> the similarity,  0.0 or less indicates that it matches less than the required
    /// threshold and 1.0 indicates that the text and target are identical
    /// </returns>
    private float Similarity(System.String target)
    {
      lock (this)
      {
        int m = target.Length;
        int n = text.Length;
        if (n == 0)
        {
          //we don't have anything to compare.  That means if we just add
          //the letters for m we get the new word
          return prefix.Length == 0 ? 0.0f : 1.0f - ((float) m / prefix.Length);
        }
        if (m == 0)
        {
          return prefix.Length == 0 ? 0.0f : 1.0f - ((float) n / prefix.Length);
        }
        
        int maxDistance = GetMaxDistance(m);
        
        if (maxDistance < System.Math.Abs(m - n))
        {
          //just adding the characters of m to n or vice-versa results in
          //too many edits
          //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
          //given this optimal circumstance, the edit distance cannot be less than 5.
          //which is 8-3 or more precisesly Math.abs(3-8).
          //if our maximum edit distance is 4, then we can discard this word
          //without looking at it.
          return 0.0f;
        }
        
        //let's make sure we have enough room in our array to do the distance calculations.
        if (d[0].Length <= m)
        {
          GrowDistanceArray(m);
        }
        
        // init matrix d
        for (int i = 0; i <= n; i++)
          d[i][0] = i;
        for (int j = 0; j <= m; j++)
          d[0][j] = j;
        
        // start computing edit distance
        for (int i = 1; i <= n; i++)
        {
          int bestPossibleEditDistance = m;
          char s_i = text[i - 1];
          for (int j = 1; j <= m; j++)
          {
            if (s_i != target[j - 1])
            {
              d[i][j] = min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1]) + 1;
            }
            else
            {
              d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]);
            }
            bestPossibleEditDistance = System.Math.Min(bestPossibleEditDistance, d[i][j]);
          }
          
          //After calculating row i, the best possible edit distance
          //can be found by found by finding the smallest value in a given column.
          //If the bestPossibleEditDistance is greater than the max distance, abort.
          
          if (i > maxDistance && bestPossibleEditDistance > maxDistance)
          {
            //equal is okay, but not greater
            //the closest the target can be to the text is just too far away.
            //this target is leaving the party early.
            return 0.0f;
          }
        }
        
        // this will return less than 0.0 when the edit distance is
        // greater than the number of characters in the shorter word.
        // but this was the formula that was previously used in FuzzyTermEnum,
        // so it has not been changed (even though minimumSimilarity must be
        // greater than 0.0)
        return 1.0f - ((float) d[n][m] / (float) (prefix.Length + System.Math.Min(n, m)));
      }
    }
    
    /// <summary> Grow the second dimension of the array, so that we can calculate the
    /// Levenshtein difference.
    /// </summary>
    private void  GrowDistanceArray(int m)
    {
      for (int i = 0; i < d.Length; i++)
      {
        d[i] = new int[m + 1];
      }
    }
    
    /// <summary> The max Distance is the maximum Levenshtein distance for the text
    /// compared to some other value that results in score that is
    /// better than the minimum similarity.
    /// </summary>
    /// <param name="m">the length of the "other value"
    /// </param>
    /// <returns> the maximum levenshtein distance that we care about
    /// </returns>
    private int GetMaxDistance(int m)
    {
      return (m < maxDistances.Length)?maxDistances[m]:CalculateMaxDistance(m);
    }
    
    private void  InitializeMaxDistances()
    {
      for (int i = 0; i < maxDistances.Length; i++)
      {
        maxDistances[i] = CalculateMaxDistance(i);
      }
    }
    
    private int CalculateMaxDistance(int m)
    {
      return (int) ((1 - minimumSimilarity) * (System.Math.Min(text.Length, m) + prefix.Length));
    }
    
    public override void  Close()
    {
      base.Close(); //call super.close() and let the garbage collector do its work.
    }
  }
}
www.java2v.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.