package org.apache.lucene.analysis.ngram;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

import java.io.StringReader;
import java.util.ArrayList;

import junit.framework.TestCase;

/**
 * Tests {@link NGramTokenizer} for correctness.
 * @author Otis Gospodnetic
 */
public class NGramTokenizerTest extends TestCase {
  private StringReader input;
  private ArrayList<String> tokens = new ArrayList<String>();

  public void setUp() {
    input = new StringReader("abcde");
  }

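  /** A minGram (2) larger than maxGram (1) must be rejected with an IllegalArgumentException. */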
  public void testInvalidInput() throws Exception {
    boolean gotException = false;
    try {
      new NGramTokenizer(input, 2, 1);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

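  /** A minGram below 1 (here 0) must likewise be rejected with an IllegalArgumentException. */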
  public void testInvalidInput2() throws Exception {
    boolean gotException = false;
    try {
      new NGramTokenizer(input, 0, 1);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

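  /**
   * With minGram = maxGram = 1 the tokenizer emits one token per character.
   * As the expected strings show, each collected token is the Token.toString()
   * rendering (term text, start offset, end offset).
   */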
  public void testUnigrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);

    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
      tokens.add(token.toString());
    }

    assertEquals(5, tokens.size());
    ArrayList<String> exp = new ArrayList<String>();
    exp.add("(a,0,1)");
    exp.add("(b,1,2)");
    exp.add("(c,2,3)");
    exp.add("(d,3,4)");
    exp.add("(e,4,5)");
    assertEquals(exp, tokens);
  }

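  /** With minGram = maxGram = 2 the tokenizer emits every adjacent character pair. */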
  public void testBigrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);

    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
      tokens.add(token.toString());
    }

    assertEquals(4, tokens.size());
    ArrayList<String> exp = new ArrayList<String>();
    exp.add("(ab,0,2)");
    exp.add("(bc,1,3)");
    exp.add("(cd,2,4)");
    exp.add("(de,3,5)");
    assertEquals(exp, tokens);
  }

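  /**
   * With minGram = 1 and maxGram = 3 the tokenizer emits all 1-, 2-, and
   * 3-grams, grouped by gram size: first every unigram, then every bigram,
   * then every trigram, as the expected list below reflects.
   */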
  public void testNgrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);

    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
      tokens.add(token.toString());
    }

    assertEquals(12, tokens.size());
    ArrayList<String> exp = new ArrayList<String>();
    exp.add("(a,0,1)");
    exp.add("(b,1,2)");
    exp.add("(c,2,3)");
    exp.add("(d,3,4)");
    exp.add("(e,4,5)");
    exp.add("(ab,0,2)");
    exp.add("(bc,1,3)");
    exp.add("(cd,2,4)");
    exp.add("(de,3,5)");
    exp.add("(abc,0,3)");
    exp.add("(bcd,1,4)");
    exp.add("(cde,2,5)");
    assertEquals(exp, tokens);
  }

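  /** Gram sizes larger than the input (6 and 7 against the 5-char "abcde") yield no tokens. */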
  public void testOversizedNgrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);

    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
      tokens.add(token.toString());
    }

    assertTrue(tokens.isEmpty());
  }
}