/*
* Copyright 2004-2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
namespace Lucene.Net.Analysis{
/// <summary>
/// A filter that replaces accented characters in the ISO Latin 1 character set
/// (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
/// <para>
/// For instance, 'à' will be replaced by 'a'.
/// </para>
/// </summary>
public class ISOLatin1AccentFilter : TokenFilter
{
    /// <summary>Creates a filter that reads its tokens from <paramref name="input"/>.</summary>
    /// <param name="input">The token stream whose tokens are to be filtered.</param>
    public ISOLatin1AccentFilter(TokenStream input) : base(input)
    {
    }

    /// <summary>
    /// Reads the next token from the wrapped stream and returns a new token whose term
    /// text has been passed through <see cref="RemoveAccents"/>.
    /// </summary>
    /// <returns>The filtered token, or <c>null</c> when the wrapped stream returns <c>null</c>.</returns>
    public override Token Next()
    {
        Token t = input.Next();
        if (t == null)
        {
            return null;
        }

        // Return a token with filtered characters.
        // NOTE(review): the new Token is built only from the term text, offsets and type of
        // the source token; any other per-token state the source token may carry (for
        // example a position increment, if Token has one) is not copied -- confirm that
        // this is intended.
        return new Token(RemoveAccents(t.TermText()), t.StartOffset(), t.EndOffset(), t.Type());
    }

    /// <summary>
    /// Replaces the accented characters in a string by their unaccented equivalents.
    /// Ligatures and a few special letters expand to two characters
    /// (for example U+00C6 becomes "AE" and U+00DF becomes "ss").
    /// </summary>
    /// <param name="input">The string to process. Must not be <c>null</c>.</param>
    /// <returns>
    /// The string with every mapped character replaced. When <paramref name="input"/> contains
    /// no mapped character it is returned as is, without allocating a copy.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    public static System.String RemoveAccents(System.String input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException("input");
        }

        // Created lazily: most terms contain nothing to replace, and for those
        // no builder (and no copy of the string) is needed at all.
        System.Text.StringBuilder output = null;

        for (int i = 0; i < input.Length; i++)
        {
            char c = input[i];
            System.String replacement = GetUnaccentedReplacement(c);

            if (replacement == null)
            {
                // Unmapped character: only needs copying once a builder exists.
                if (output != null)
                {
                    output.Append(c);
                }
                continue;
            }

            if (output == null)
            {
                // First mapped character: start the builder with the untouched prefix.
                // A little head-room is reserved because some replacements are two characters long.
                output = new System.Text.StringBuilder(input.Length + 8);
                output.Append(input, 0, i);
            }

            output.Append(replacement);
        }

        return output == null ? input : output.ToString();
    }

    /// <summary>
    /// Looks up the unaccented replacement for a single character.
    /// </summary>
    /// <param name="c">The character to look up.</param>
    /// <returns>The replacement text, or <c>null</c> when the character is not mapped.</returns>
    private static System.String GetUnaccentedReplacement(char c)
    {
        // Every mapped character is at or above U+00C0, so plain ASCII exits immediately.
        if (c < '\u00C0')
        {
            return null;
        }

        switch (c)
        {
            case '\u00C0': // A grave
            case '\u00C1': // A acute
            case '\u00C2': // A circumflex
            case '\u00C3': // A tilde
            case '\u00C4': // A diaeresis
            case '\u00C5': // A ring above
                return "A";
            case '\u00C6': // AE ligature
                return "AE";
            case '\u00C7': // C cedilla
                return "C";
            case '\u00C8': // E grave
            case '\u00C9': // E acute
            case '\u00CA': // E circumflex
            case '\u00CB': // E diaeresis
                return "E";
            case '\u00CC': // I grave
            case '\u00CD': // I acute
            case '\u00CE': // I circumflex
            case '\u00CF': // I diaeresis
                return "I";
            case '\u00D0': // capital eth
                return "D";
            case '\u00D1': // N tilde
                return "N";
            case '\u00D2': // O grave
            case '\u00D3': // O acute
            case '\u00D4': // O circumflex
            case '\u00D5': // O tilde
            case '\u00D6': // O diaeresis
            case '\u00D8': // O with stroke
                return "O";
            case '\u0152': // OE ligature
                return "OE";
            case '\u00DE': // capital thorn
                return "TH";
            case '\u00D9': // U grave
            case '\u00DA': // U acute
            case '\u00DB': // U circumflex
            case '\u00DC': // U diaeresis
                return "U";
            case '\u00DD': // Y acute
            case '\u0178': // Y diaeresis
                return "Y";
            case '\u00E0': // a grave
            case '\u00E1': // a acute
            case '\u00E2': // a circumflex
            case '\u00E3': // a tilde
            case '\u00E4': // a diaeresis
            case '\u00E5': // a ring above
                return "a";
            case '\u00E6': // ae ligature
                return "ae";
            case '\u00E7': // c cedilla
                return "c";
            case '\u00E8': // e grave
            case '\u00E9': // e acute
            case '\u00EA': // e circumflex
            case '\u00EB': // e diaeresis
                return "e";
            case '\u00EC': // i grave
            case '\u00ED': // i acute
            case '\u00EE': // i circumflex
            case '\u00EF': // i diaeresis
                return "i";
            case '\u00F0': // small eth
                return "d";
            case '\u00F1': // n tilde
                return "n";
            case '\u00F2': // o grave
            case '\u00F3': // o acute
            case '\u00F4': // o circumflex
            case '\u00F5': // o tilde
            case '\u00F6': // o diaeresis
            case '\u00F8': // o with stroke
                return "o";
            case '\u0153': // oe ligature
                return "oe";
            case '\u00DF': // sharp s
                return "ss";
            case '\u00FE': // small thorn
                return "th";
            case '\u00F9': // u grave
            case '\u00FA': // u acute
            case '\u00FB': // u circumflex
            case '\u00FC': // u diaeresis
                return "u";
            case '\u00FD': // y acute
            case '\u00FF': // y diaeresis
                return "y";
            default:
                // Not mapped (this includes U+00D7 and U+00F7, the multiplication and division signs).
                return null;
        }
    }
}
}
|