001: package org.apache.lucene.analysis.el;
002:
003: /**
004: * Copyright 2005 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import org.apache.lucene.analysis.Analyzer;
020: import org.apache.lucene.analysis.StopFilter;
021: import org.apache.lucene.analysis.TokenStream;
022: import org.apache.lucene.analysis.standard.StandardTokenizer;
023:
024: import java.io.Reader;
025: import java.util.HashSet;
026: import java.util.Hashtable;
027: import java.util.Set;
028:
029: /**
030: * Analyzer for the Greek language. Supports an external list of stopwords (words
031: * that will not be indexed at all).
032: * A default set of stopwords is used unless an alternative list is specified.
033: *
034: * @author Panagiotis Astithas, past@ebs.gr
035: */
036: public final class GreekAnalyzer extends Analyzer {
037: // the letters are indexes to the charset array (see GreekCharsets.java)
038: private static char A = 6;
039: private static char B = 7;
040: private static char G = 8;
041: private static char D = 9;
042: private static char E = 10;
043: private static char Z = 11;
044: private static char H = 12;
045: private static char TH = 13;
046: private static char I = 14;
047: private static char K = 15;
048: private static char L = 16;
049: private static char M = 17;
050: private static char N = 18;
051: private static char KS = 19;
052: private static char O = 20;
053: private static char P = 21;
054: private static char R = 22;
055: private static char S = 24; // skip final sigma
056: private static char T = 25;
057: private static char Y = 26;
058: private static char F = 27;
059: private static char X = 28;
060: private static char PS = 29;
061: private static char W = 30;
062:
063: /**
064: * List of typical Greek stopwords.
065: */
066: private static char[][] GREEK_STOP_WORDS = { { O }, { H },
067: { T, O }, { O, I }, { T, A }, { T, O, Y }, { T, H, S },
068: { T, W, N }, { T, O, N }, { T, H, N }, { K, A, I },
069: { K, I }, { K }, { E, I, M, A, I }, { E, I, S, A, I },
070: { E, I, N, A, I }, { E, I, M, A, S, T, E },
071: { E, I, S, T, E }, { S, T, O }, { S, T, O, N },
072: { S, T, H }, { S, T, H, N }, { M, A }, { A, L, L, A },
073: { A, P, O }, { G, I, A }, { P, R, O, S }, { M, E },
074: { S, E }, { W, S }, { P, A, R, A }, { A, N, T, I },
075: { K, A, T, A }, { M, E, T, A }, { TH, A }, { N, A },
076: { D, E }, { D, E, N }, { M, H }, { M, H, N }, { E, P, I },
077: { E, N, W }, { E, A, N }, { A, N }, { T, O, T, E },
078: { P, O, Y }, { P, W, S }, { P, O, I, O, S },
079: { P, O, I, A }, { P, O, I, O }, { P, O, I, O, I },
080: { P, O, I, E, S }, { P, O, I, W, N }, { P, O, I, O, Y, S },
081: { A, Y, T, O, S }, { A, Y, T, H }, { A, Y, T, O },
082: { A, Y, T, O, I }, { A, Y, T, W, N }, { A, Y, T, O, Y, S },
083: { A, Y, T, E, S }, { A, Y, T, A }, { E, K, E, I, N, O, S },
084: { E, K, E, I, N, H }, { E, K, E, I, N, O },
085: { E, K, E, I, N, O, I }, { E, K, E, I, N, E, S },
086: { E, K, E, I, N, A }, { E, K, E, I, N, W, N },
087: { E, K, E, I, N, O, Y, S }, { O, P, W, S }, { O, M, W, S },
088: { I, S, W, S }, { O, S, O }, { O, T, I } };
089:
090: /**
091: * Contains the stopwords used with the StopFilter.
092: */
093: private Set stopSet = new HashSet();
094:
095: /**
096: * Charset for Greek letters.
097: * Represents encoding for 24 lowercase Greek letters.
098: * Predefined charsets can be taken from GreekCharSets class
099: */
100: private char[] charset;
101:
102: public GreekAnalyzer() {
103: charset = GreekCharsets.UnicodeGreek;
104: stopSet = StopFilter
105: .makeStopSet(makeStopWords(GreekCharsets.UnicodeGreek));
106: }
107:
108: /**
109: * Builds an analyzer.
110: */
111: public GreekAnalyzer(char[] charset) {
112: this .charset = charset;
113: stopSet = StopFilter.makeStopSet(makeStopWords(charset));
114: }
115:
116: /**
117: * Builds an analyzer with the given stop words.
118: */
119: public GreekAnalyzer(char[] charset, String[] stopwords) {
120: this .charset = charset;
121: stopSet = StopFilter.makeStopSet(stopwords);
122: }
123:
124: // Takes greek stop words and translates them to a String array, using
125: // the given charset
126: private static String[] makeStopWords(char[] charset) {
127: String[] res = new String[GREEK_STOP_WORDS.length];
128: for (int i = 0; i < res.length; i++) {
129: char[] theStopWord = GREEK_STOP_WORDS[i];
130: // translate the word,using the charset
131: StringBuffer theWord = new StringBuffer();
132: for (int j = 0; j < theStopWord.length; j++) {
133: theWord.append(charset[theStopWord[j]]);
134: }
135: res[i] = theWord.toString();
136: }
137: return res;
138: }
139:
140: /**
141: * Builds an analyzer with the given stop words.
142: */
143: public GreekAnalyzer(char[] charset, Hashtable stopwords) {
144: this .charset = charset;
145: stopSet = new HashSet(stopwords.keySet());
146: }
147:
148: /**
149: * Creates a TokenStream which tokenizes all the text in the provided Reader.
150: *
151: * @return A TokenStream build from a StandardTokenizer filtered with
152: * GreekLowerCaseFilter and StopFilter
153: */
154: public TokenStream tokenStream(String fieldName, Reader reader) {
155: TokenStream result = new StandardTokenizer(reader);
156: result = new GreekLowerCaseFilter(result, charset);
157: result = new StopFilter(result, stopSet);
158: return result;
159: }
160: }
|