PorterStemmer.cs : » Search-Engines » dotLucene » Lucene » Net » Analysis » C# / CSharp Open Source

1.	2.6.4 mono .net core
2.	2.6.4 mono core
3.	Aspect Oriented Frameworks
4.	Bloggers
5.	Build Systems
6.	Business Application
7.	Charting Reporting Tools
8.	Chat Servers
9.	Code Coverage Tools
10.	Content Management Systems CMS
11.	CRM ERP
12.	Database
13.	Development
14.	Email
15.	Forum
16.	Game
17.	GIS
18.	GUI
19.	IDEs
20.	Installers Generators
21.	Inversion of Control Dependency Injection
22.	Issue Tracking
23.	Logging Tools
24.	Message
25.	Mobile
26.	Network Clients
27.	Network Servers
28.	Office
29.	PDF
30.	Persistence Frameworks
31.	Portals
32.	Profilers
33.	Project Management
34.	RSS RDF
35.	Rule Engines
36.	Script
37.	Search Engines
38.	Sound Audio
39.	Source Control
40.	SQL Clients
41.	Template Engines
42.	Testing
43.	UML
44.	Web Frameworks
45.	Web Service
46.	Web Testing
47.	Wiki Engines
48.	Windows Presentation Foundation
49.	Workflows
50.	XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » Search Engines » dotLucene
dotLucene » Lucene » Net » Analysis » PorterStemmer.cs
/*
 * Copyright 2004 The Apache Software Foundation
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

Porter stemmer in Java. The original paper is in

Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,

See also http://www.tartarus.org/~martin/PorterStemmer/index.html

Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then outside the bounds of b.

Similarly,

Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.

Release 3.

[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations.  ]
*/

using System;

namespace Lucene.Net.Analysis{
  
  /// <summary>
  /// Stemmer implementing the Porter stemming algorithm.
  ///
  /// The stemmer transforms a word into its root form.  The input word can be
  /// provided one character at a time (by calling <c>Add(char)</c> and then
  /// <c>Stem()</c>), or all at once through one of the <c>Stem(...)</c>
  /// overloads that take a string or a character array.
  ///
  /// The word is held in an internal buffer that is modified in place; the
  /// result is read back with <c>ToString()</c>, or with
  /// <c>GetResultBuffer()</c> together with <c>GetResultLength()</c>.
  /// </summary>
  
  class PorterStemmer
  {
    /* Working buffer holding the word being stemmed. */
    private char[] b;
    /* i  : number of characters currently stored in b.
       j  : offset set by Ends() to the last index of the stem that precedes
            the matched suffix (Step6 sets it to k).
       k  : index of the last character of the current word.
       k0 : index of the first character of the current word. */
    private int i, j, k, k0;
    /* True once the buffer content differs from the word originally supplied. */
    private bool dirty = false;
    private const int INC = 50; /* unit of size whereby b is increased */
    private const int EXTRA = 1; /* spare slots kept free at the end of b */
    
    /// <summary> Creates a stemmer with an empty buffer of INC characters.</summary>
    public PorterStemmer()
    {
      b = new char[INC];
      i = 0;
    }
    
    /// <summary> Resets the stemmer so it can stem another word.  If you invoke
    /// the stemmer by calling Add(char) and then Stem(), you must call Reset()
    /// before starting another word.
    /// </summary>
    public virtual void  Reset()
    {
      i = 0;
      dirty = false;
    }
    
    /// <summary> Adds a character to the word being stemmed.  When you are finished
    /// adding characters, call Stem() to process the word.  The buffer grows
    /// by INC characters whenever fewer than EXTRA + 1 free slots remain.
    /// </summary>
    /// <param name="ch">The character to append.</param>
    public virtual void  Add(char ch)
    {
      if (b.Length <= i + EXTRA)
      {
        char[] grown = new char[b.Length + INC];
        System.Array.Copy(b, 0, grown, 0, b.Length);
        b = grown;
      }
      b[i++] = ch;
    }
    
    /// <summary> After a word has been stemmed, it can be retrieved by ToString(),
    /// or a reference to the internal buffer can be retrieved by GetResultBuffer()
    /// and GetResultLength() (which avoids creating a new string).
    /// </summary>
    /// <returns>A new string made of the first GetResultLength() characters of the buffer.</returns>
    public override System.String ToString()
    {
      return new System.String(b, 0, i);
    }
    
    /// <summary> Returns the length of the word resulting from the stemming process.</summary>
    public virtual int GetResultLength()
    {
      return i;
    }
    
    /// <summary> Returns a reference to the internal character buffer containing the
    /// result of the stemming process (not a copy).  You also need to consult
    /// GetResultLength() to determine the length of the result.
    /// </summary>
    public virtual char[] GetResultBuffer()
    {
      return b;
    }
    
    /* Cons(i) is true <=> b[i] is a consonant.
    
    a, e, i, o, u are vowels.  A 'y' is a consonant when it is the first
    letter of the word (index k0); otherwise it is a consonant exactly when
    the preceding letter is not.  Every other character is a consonant.
    
    The 'y' rule is evaluated with a loop rather than recursion so that the
    stack depth does not depend on the length of a run of 'y' characters. */
    
    private bool Cons(int i)
    {
      int pos = i;
      bool inverted = false;
      // Walk back over consecutive 'y's; each one negates the answer of its
      // predecessor.  Stop at the first letter of the word or at a non-'y'.
      while (b[pos] == 'y' && pos != k0)
      {
        pos--;
        inverted = !inverted;
      }
      
      bool consonant;
      switch (b[pos])
      {
        case 'a': 
        case 'e': 
        case 'i': 
        case 'o': 
        case 'u': 
          consonant = false;
          break;
        
        default: 
          // Includes a 'y' sitting at k0, which counts as a consonant.
          consonant = true;
          break;
      }
      return inverted ? !consonant : consonant;
    }
    
    /* M() measures the number of consonant sequences between k0 and j. if c is
    a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
    presence,
    
    <c><v>       gives 0
    <c>vc<v>     gives 1
    <c>vcvc<v>   gives 2
    <c>vcvcvc<v> gives 3
    ....
    */
    
    private int M()
    {
      int count = 0;
      int pos = k0;
      // Skip the optional leading consonant sequence.
      while (true)
      {
        if (pos > j)
        {
          return count;
        }
        if (!Cons(pos))
        {
          break;
        }
        pos++;
      }
      pos++;
      while (true)
      {
        // Consume a vowel sequence ...
        while (true)
        {
          if (pos > j)
          {
            return count;
          }
          if (Cons(pos))
          {
            break;
          }
          pos++;
        }
        pos++;
        // ... which was followed by a consonant: one more "vc" pair.
        count++;
        // Consume the rest of that consonant sequence.
        while (true)
        {
          if (pos > j)
          {
            return count;
          }
          if (!Cons(pos))
          {
            break;
          }
          pos++;
        }
        pos++;
      }
    }
    
    /* Vowelinstem() is true <=> k0,...j contains a vowel */
    
    private bool Vowelinstem()
    {
      for (int pos = k0; pos <= j; pos++)
      {
        if (!Cons(pos))
        {
          return true;
        }
      }
      return false;
    }
    
    /* Doublec(j) is true <=> j,(j-1) contain a double consonant. */
    
    private bool Doublec(int j)
    {
      if (j < k0 + 1)
      {
        return false;
      }
      if (b[j] != b[j - 1])
      {
        return false;
      }
      return Cons(j);
    }
    
    /* Cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
    and also if the second c is not w,x or y. this is used when trying to
    restore an e at the end of a short word. e.g.
    
    cav(e), lov(e), hop(e), crim(e), but
    snow, box, tray.
    
    */
    
    private bool Cvc(int i)
    {
      if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
      {
        return false;
      }
      char last = b[i];
      return !(last == 'w' || last == 'x' || last == 'y');
    }
    
    /* Ends(s) is true <=> k0,...k ends with the string s.  On a match, j is
    set to the index of the last character before the suffix; on a mismatch
    j is left untouched. */
    
    private bool Ends(System.String s)
    {
      int length = s.Length;
      int start = k - length + 1;
      if (start < k0)
      {
        // The suffix is longer than the word.
        return false;
      }
      for (int n = 0; n < length; n++)
      {
        if (b[start + n] != s[n])
        {
          return false;
        }
      }
      j = k - length;
      return true;
    }
    
    /* Setto(s) sets (j+1),...k to the characters in the string s, readjusting
    k, and marks the word as changed. */
    
    internal virtual void  Setto(System.String s)
    {
      int length = s.Length;
      int start = j + 1;
      for (int n = 0; n < length; n++)
      {
        b[start + n] = s[n];
      }
      k = j + length;
      dirty = true;
    }
    
    /* R(s) replaces the suffix matched by the last successful Ends() with s,
    but only when the stem before it has M() > 0.  Used further down. */
    
    internal virtual void  R(System.String s)
    {
      if (M() > 0)
      {
        Setto(s);
      }
    }
    
    /* Step1() gets rid of plurals and -ed or -ing. e.g.
    
    caresses  ->  caress
    ponies    ->  poni
    ties      ->  ti
    caress    ->  caress
    cats      ->  cat
    
    feed      ->  feed
    agreed    ->  agree
    disabled  ->  disable
    
    matting   ->  mat
    mating    ->  mate
    meeting   ->  meet
    milling   ->  mill
    messing   ->  mess
    
    meetings  ->  meet
    
    */
    
    private void  Step1()
    {
      if (b[k] == 's')
      {
        if (Ends("sses"))
        {
          k -= 2;
        }
        else if (Ends("ies"))
        {
          Setto("i");
        }
        else if (b[k - 1] != 's')
        {
          k--;
        }
      }
      if (Ends("eed"))
      {
        if (M() > 0)
        {
          k--;
        }
      }
      else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
      {
        // Drop the -ed / -ing, then tidy up what is left.
        k = j;
        if (Ends("at"))
        {
          Setto("ate");
        }
        else if (Ends("bl"))
        {
          Setto("ble");
        }
        else if (Ends("iz"))
        {
          Setto("ize");
        }
        else if (Doublec(k))
        {
          // Undouble the final consonant, except for l, s and z.
          char doubled = b[k];
          if (doubled != 'l' && doubled != 's' && doubled != 'z')
          {
            k--;
          }
        }
        else if (M() == 1 && Cvc(k))
        {
          Setto("e");
        }
      }
    }
    
    /* Step2() turns terminal y to i when there is another vowel in the stem. */
    
    private void  Step2()
    {
      if (Ends("y") && Vowelinstem())
      {
        b[k] = 'i';
        dirty = true;
      }
    }
    
    /* Step3() maps double suffices to single ones. so -ization ( = -ize plus
    -ation) maps to -ize etc. note that the string before the suffix must give
    M() > 0.  The switch on the penultimate letter only narrows down which
    suffixes need to be tried. */
    
    private void  Step3()
    {
      if (k == k0)
      {
        return ; /* For Bug 1 */
      }
      switch (b[k - 1])
      {
        
        case 'a': 
          if (Ends("ational"))
          {
            R("ate");
          }
          else if (Ends("tional"))
          {
            R("tion");
          }
          break;
        
        case 'c': 
          if (Ends("enci"))
          {
            R("ence");
          }
          else if (Ends("anci"))
          {
            R("ance");
          }
          break;
        
        case 'e': 
          if (Ends("izer"))
          {
            R("ize");
          }
          break;
        
        case 'l': 
          if (Ends("bli"))
          {
            R("ble");
          }
          else if (Ends("alli"))
          {
            R("al");
          }
          else if (Ends("entli"))
          {
            R("ent");
          }
          else if (Ends("eli"))
          {
            R("e");
          }
          else if (Ends("ousli"))
          {
            R("ous");
          }
          break;
        
        case 'o': 
          if (Ends("ization"))
          {
            R("ize");
          }
          else if (Ends("ation"))
          {
            R("ate");
          }
          else if (Ends("ator"))
          {
            R("ate");
          }
          break;
        
        case 's': 
          if (Ends("alism"))
          {
            R("al");
          }
          else if (Ends("iveness"))
          {
            R("ive");
          }
          else if (Ends("fulness"))
          {
            R("ful");
          }
          else if (Ends("ousness"))
          {
            R("ous");
          }
          break;
        
        case 't': 
          if (Ends("aliti"))
          {
            R("al");
          }
          else if (Ends("iviti"))
          {
            R("ive");
          }
          else if (Ends("biliti"))
          {
            R("ble");
          }
          break;
        
        case 'g': 
          if (Ends("logi"))
          {
            R("log");
          }
          break;
        }
    }
    
    /* Step4() deals with -ic-, -full, -ness etc. similar strategy to Step3,
    but switching on the last letter. */
    
    private void  Step4()
    {
      switch (b[k])
      {
        
        case 'e': 
          if (Ends("icate"))
          {
            R("ic");
          }
          else if (Ends("ative"))
          {
            R("");
          }
          else if (Ends("alize"))
          {
            R("al");
          }
          break;
        
        case 'i': 
          if (Ends("iciti"))
          {
            R("ic");
          }
          break;
        
        case 'l': 
          if (Ends("ical"))
          {
            R("ic");
          }
          else if (Ends("ful"))
          {
            R("");
          }
          break;
        
        case 's': 
          if (Ends("ness"))
          {
            R("");
          }
          break;
        }
    }
    
    /* Step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
    
    private void  Step5()
    {
      if (k == k0)
      {
        return ; /* for Bug 1 */
      }
      
      // Ends() records the stem boundary in j when it matches; the suffix is
      // only removed below if that stem has M() > 1.
      bool suffixFound;
      switch (b[k - 1])
      {
        
        case 'a': 
          suffixFound = Ends("al");
          break;
        
        case 'c': 
          suffixFound = Ends("ance") || Ends("ence");
          break;
        
        case 'e': 
          suffixFound = Ends("er");
          break;
        
        case 'i': 
          suffixFound = Ends("ic");
          break;
        
        case 'l': 
          suffixFound = Ends("able") || Ends("ible");
          break;
        
        case 'n': 
          /* element etc. not stripped before the m */
          suffixFound = Ends("ant") || Ends("ement") || Ends("ment") || Ends("ent");
          break;
        
        case 'o': 
          /* j >= k0 fixes Bug 2: 'ion' by itself leaves j before the word */
          /* "ou" takes care of -ous */
          suffixFound = (Ends("ion") && j >= k0 && (b[j] == 's' || b[j] == 't')) || Ends("ou");
          break;
        
        case 's': 
          suffixFound = Ends("ism");
          break;
        
        case 't': 
          suffixFound = Ends("ate") || Ends("iti");
          break;
        
        case 'u': 
          suffixFound = Ends("ous");
          break;
        
        case 'v': 
          suffixFound = Ends("ive");
          break;
        
        case 'z': 
          suffixFound = Ends("ize");
          break;
        
        default: 
          suffixFound = false;
          break;
        
      }
      if (suffixFound && M() > 1)
      {
        k = j;
      }
    }
    
    /* Step6() removes a final -e if M() > 1, or if M() == 1 and the letters
    before the e are not consonant-vowel-consonant; it then reduces a final
    double l to a single l when M() > 1. */
    
    private void  Step6()
    {
      j = k;
      if (b[k] == 'e')
      {
        int measure = M();
        if (measure > 1 || (measure == 1 && !Cvc(k - 1)))
        {
          k--;
        }
      }
      // j is deliberately not refreshed here: M() still measures up to the
      // position k had on entry.
      if (b[k] == 'l' && Doublec(k) && M() > 1)
      {
        k--;
      }
    }
    
    
    /// <summary> Stems a word provided as a string.</summary>
    /// <param name="s">The word to stem.</param>
    /// <returns>The stemmed word, or <paramref name="s"/> itself when stemming
    /// left the word unchanged.</returns>
    public virtual System.String Stem(System.String s)
    {
      if (Stem(s.ToCharArray(), s.Length))
      {
        return ToString();
      }
      return s;
    }
    
    /// <summary>Stems a word contained in a char[].  Returns true if the stemming process
    /// resulted in a word different from the input.  You can retrieve the
    /// result with GetResultLength()/GetResultBuffer() or ToString().
    /// </summary>
    /// <param name="word">The word to stem; the whole array is used.</param>
    public virtual bool Stem(char[] word)
    {
      return Stem(word, word.Length);
    }
    
    /// <summary>Stems a word contained in a portion of a char[] array.  Returns
    /// true if the stemming process resulted in a word different from
    /// the input.  You can retrieve the result with
    /// GetResultLength()/GetResultBuffer() or ToString().
    /// </summary>
    /// <param name="wordBuffer">Array holding the word; it is copied, not modified.</param>
    /// <param name="offset">Index in <paramref name="wordBuffer"/> of the first character of the word.</param>
    /// <param name="wordLen">Number of characters in the word.</param>
    public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
    {
      Reset();
      if (b.Length < wordLen)
      {
        // The previous content is not needed, so allocate without copying.
        b = new char[wordLen + EXTRA];
      }
      for (int n = 0; n < wordLen; n++)
      {
        b[n] = wordBuffer[offset + n];
      }
      i = wordLen;
      return Stem(0);
    }
    
    /// <summary>Stems a word contained in a leading portion of a char[] array.
    /// Returns true if the stemming process resulted in a word different
    /// from the input.  You can retrieve the result with
    /// GetResultLength()/GetResultBuffer() or ToString().
    /// </summary>
    /// <param name="word">Array holding the word, starting at index 0.</param>
    /// <param name="wordLen">Number of characters in the word.</param>
    public virtual bool Stem(char[] word, int wordLen)
    {
      return Stem(word, 0, wordLen);
    }
    
    /// <summary>Stems the word placed into the stemmer buffer through calls to Add().
    /// Returns true if the stemming process resulted in a word different
    /// from the input.  You can retrieve the result with
    /// GetResultLength()/GetResultBuffer() or ToString().
    /// </summary>
    public virtual bool Stem()
    {
      return Stem(0);
    }
    
    /// <summary>Stems the buffer content, treating index <paramref name="i0"/> as the
    /// first character of the word and the last added character as its end.
    /// Words of fewer than three characters are left untouched.
    /// </summary>
    /// <param name="i0">Index in the internal buffer where the word starts.</param>
    /// <returns>True if the word was modified or shortened.</returns>
    public virtual bool Stem(int i0)
    {
      k = i - 1;
      k0 = i0;
      if (k > k0 + 1)
      {
        Step1();
        Step2();
        Step3();
        Step4();
        Step5();
        Step6();
      }
      // Also, a word is considered dirty if we lopped off letters
      // Thanks to Ifigenia Vairelles for pointing this out.
      if (i != k + 1)
      {
        dirty = true;
      }
      i = k + 1;
      return dirty;
    }
    
    /// <summary>Test program for demonstrating the stemmer.  It reads each file named on
    /// the command line and writes its content to standard out with every run
    /// of letters lower-cased and stemmed; other characters are copied as is.
    /// Each byte of the file is treated as one character (no text decoding).
    /// Usage: Stemmer file-name
    /// </summary>
    [STAThread]
    public static void  Main(System.String[] args)
    {
      PorterStemmer s = new PorterStemmer();
      
      for (int i = 0; i < args.Length; i++)
      {
        try
        {
          // The using statement closes the file even when reading fails part-way.
          using (System.IO.BinaryReader reader = new System.IO.BinaryReader(System.IO.File.Open(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read)))
          {
            byte[] buffer = new byte[1024];
            int bufferLen, offset, ch;
            
            bufferLen = reader.Read(buffer, 0, buffer.Length);
            offset = 0;
            s.Reset();
            
            while (true)
            {
              if (offset < bufferLen)
              {
                ch = buffer[offset++];
              }
              else
              {
                // Buffer exhausted: refill it, or signal end of input with -1.
                bufferLen = reader.Read(buffer, 0, buffer.Length);
                offset = 0;
                if (bufferLen <= 0)
                {
                  ch = - 1;
                }
                else
                {
                  ch = buffer[offset++];
                }
              }
              
              // ch >= 0 keeps the end-of-input marker from being cast to char.
              if (ch >= 0 && System.Char.IsLetter((char) ch))
              {
                s.Add(System.Char.ToLower((char) ch));
              }
              else
              {
                // A non-letter (or end of input) terminates the current word.
                s.Stem();
                System.Console.Out.Write(s.ToString());
                s.Reset();
                if (ch < 0)
                {
                  break;
                }
                System.Console.Out.Write((char) ch);
              }
            }
          }
        }
        catch (System.IO.IOException)
        {
          System.Console.Out.WriteLine("error reading " + args[i]);
        }
      }
    }
  }
}
www.java2v.com | Contact Us
All other trademarks are property of their respective owners.