package org.apache.lucene.analysis.ru;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Test case for RussianAnalyzer.
 *
 * Each test feeds a sample text file to the analyzer and checks its tokens,
 * one by one, against a pre-tokenized expected file in the same charset.
 *
 * @version $Id: TestRussianAnalyzer.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class TestRussianAnalyzer extends TestCase {
    private InputStreamReader inWords;

    private InputStreamReader sampleUnicode;

    private Reader inWordsKOI8;

    private Reader sampleKOI8;

    private Reader inWords1251;

    private Reader sample1251;

    private File dataDir;

    protected void setUp() throws Exception {
        // Root directory of the test data files; defaults to ./bin and can be
        // overridden with the "dataDir" system property.
        dataDir = new File(System.getProperty("dataDir", "./bin"));
    }

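    // Both the input text and the expected result are read with Java's "Unicode"
    // charset (an alias for UTF-16); the analyzer's output must match the
    // pre-tokenized expected file term for term.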
    public void testUnicode() throws IOException {
        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

        inWords = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/testUnicode.txt")),
                "Unicode");

        sampleUnicode = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/resUnicode.htm")),
                "Unicode");

        TokenStream in = ra.tokenStream("all", inWords);

        RussianLetterTokenizer sample = new RussianLetterTokenizer(
                sampleUnicode, RussianCharsets.UnicodeRussian);

        for (;;) {
            Token token = in.next();

            if (token == null) {
                break;
            }

            Token sampleToken = sample.next();
            assertEquals("Unicode", token.termText(),
                    sampleToken == null ? null : sampleToken.termText());
        }

        inWords.close();
        sampleUnicode.close();
    }

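    // The KOI8-encoded files are read as "iso-8859-1" so that every byte maps
    // unchanged onto a char; RussianCharsets.KOI8 then interprets those char
    // codes as KOI8 Cyrillic letters.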
    public void testKOI8() throws IOException {
        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

        inWordsKOI8 = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/testKOI8.txt")),
                "iso-8859-1");

        sampleKOI8 = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/resKOI8.htm")),
                "iso-8859-1");

        TokenStream in = ra.tokenStream("all", inWordsKOI8);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(
                sampleKOI8, RussianCharsets.KOI8);

        for (;;) {
            Token token = in.next();

            if (token == null) {
                break;
            }

            Token sampleToken = sample.next();
            assertEquals("KOI8", token.termText(),
                    sampleToken == null ? null : sampleToken.termText());
        }

        inWordsKOI8.close();
        sampleKOI8.close();
    }

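    // Same approach as testKOI8, but the input and expected files are encoded
    // in CP1251 and interpreted via RussianCharsets.CP1251.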
    public void test1251() throws IOException {
        inWords1251 = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/test1251.txt")),
                "iso-8859-1");

        sample1251 = new InputStreamReader(
                new FileInputStream(new File(dataDir,
                        "/org/apache/lucene/analysis/ru/res1251.htm")),
                "iso-8859-1");

        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
        TokenStream in = ra.tokenStream("", inWords1251);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(
                sample1251, RussianCharsets.CP1251);

        for (;;) {
            Token token = in.next();

            if (token == null) {
                break;
            }

            Token sampleToken = sample.next();
            assertEquals("1251", token.termText(),
                    sampleToken == null ? null : sampleToken.termText());
        }

        inWords1251.close();
        sample1251.close();
    }
}