001: package org.apache.lucene.analysis.ru;
002:
003: /**
004:
005: * Copyright 2004 The Apache Software Foundation
006:
007: *
008:
009: * Licensed under the Apache License, Version 2.0 (the "License");
010:
011: * you may not use this file except in compliance with the License.
012:
013: * You may obtain a copy of the License at
014:
015: *
016:
017: * http://www.apache.org/licenses/LICENSE-2.0
018:
019: *
020:
021: * Unless required by applicable law or agreed to in writing, software
022:
023: * distributed under the License is distributed on an "AS IS" BASIS,
024:
025: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
026:
027: * See the License for the specific language governing permissions and
028:
029: * limitations under the License.
030:
031: */
032:
033: import java.io.Reader;
034:
035: import org.apache.lucene.analysis.CharTokenizer;
036:
037: /**
038:
039: * A RussianLetterTokenizer is a tokenizer that extends LetterTokenizer by additionally looking up letters
040:
041: * in a given "russian charset". The problem with LetterTokenizer is that it uses the Character.isLetter() method,
042:
043: * which doesn't know how to detect letters in encodings like CP1252 and KOI8
044:
045: * (well-known problems with 0xD7 and 0xF7 chars)
046:
047: *
048:
049: * @author Boris Okner, b.okner@rogers.com
050:
051: * @version $Id: RussianLetterTokenizer.java,v 1.1 2005/06/02 01:35:59 jfendler Exp $
052:
053: */
054:
055: public class RussianLetterTokenizer extends CharTokenizer
056:
057: {
058:
059: /** Construct a new LetterTokenizer. */
060:
061: private char[] charset;
062:
063: public RussianLetterTokenizer(Reader in, char[] charset)
064:
065: {
066:
067: super (in);
068:
069: this .charset = charset;
070:
071: }
072:
073: /**
074:
075: * Collects only characters which satisfy
076:
077: * {@link Character#isLetter(char)}.
078:
079: */
080:
081: protected boolean isTokenChar(char c)
082:
083: {
084:
085: if (Character.isLetter(c))
086:
087: return true;
088:
089: for (int i = 0; i < charset.length; i++)
090:
091: {
092:
093: if (c == charset[i])
094:
095: return true;
096:
097: }
098:
099: return false;
100:
101: }
102:
103: }
|