001: package org.apache.lucene.analysis;
002:
003: /**
004: * Copyright 2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import junit.framework.TestCase;
020: import org.apache.lucene.analysis.standard.StandardFilter;
021: import org.apache.lucene.analysis.standard.StandardTokenizer;
022: import org.apache.lucene.util.English;
023:
024: import java.io.IOException;
025: import java.io.StringReader;
026: import java.util.ArrayList;
027: import java.util.List;
028:
029: /**
030: * tests for the TeeTokenFilter and SinkTokenizer
031: */
032: public class TeeSinkTokenTest extends TestCase {
033: protected StringBuffer buffer1;
034: protected StringBuffer buffer2;
035: protected String[] tokens1;
036: protected String[] tokens2;
037:
/**
 * Creates the test case and forwards its name to the JUnit base class.
 *
 * @param s the name handed to the {@code TestCase} constructor
 */
public TeeSinkTokenTest(String s) {
    super(s);
}
041:
042: protected void setUp() {
043: tokens1 = new String[] { "The", "quick", "Burgundy", "Fox",
044: "jumped", "over", "the", "lazy", "Red", "Dogs" };
045: tokens2 = new String[] { "The", "Lazy", "Dogs", "should",
046: "stay", "on", "the", "porch" };
047: buffer1 = new StringBuffer();
048:
049: for (int i = 0; i < tokens1.length; i++) {
050: buffer1.append(tokens1[i]).append(' ');
051: }
052: buffer2 = new StringBuffer();
053: for (int i = 0; i < tokens2.length; i++) {
054: buffer2.append(tokens2[i]).append(' ');
055:
056: }
057: }
058:
059: protected void tearDown() {
060:
061: }
062:
063: public void test() throws IOException {
064:
065: SinkTokenizer sink1 = new SinkTokenizer(null) {
066: public void add(Token t) {
067: if (t != null && t.termText().equalsIgnoreCase("The")) {
068: super .add(t);
069: }
070: }
071: };
072: TokenStream source = new TeeTokenFilter(
073: new WhitespaceTokenizer(new StringReader(buffer1
074: .toString())), sink1);
075: Token token = null;
076: int i = 0;
077: while ((token = source.next()) != null) {
078: assertTrue(token.termText() + " is not equal to "
079: + tokens1[i],
080: token.termText().equals(tokens1[i]) == true);
081: i++;
082: }
083: assertTrue(i + " does not equal: " + tokens1.length,
084: i == tokens1.length);
085: assertTrue("sink1 Size: " + sink1.getTokens().size()
086: + " is not: " + 2, sink1.getTokens().size() == 2);
087: i = 0;
088: while ((token = sink1.next()) != null) {
089: assertTrue(token.termText() + " is not equal to " + "The",
090: token.termText().equalsIgnoreCase("The") == true);
091: i++;
092: }
093: assertTrue(i + " does not equal: " + sink1.getTokens().size(),
094: i == sink1.getTokens().size());
095: }
096:
097: public void testMultipleSources() throws Exception {
098: SinkTokenizer theDetector = new SinkTokenizer(null) {
099: public void add(Token t) {
100: if (t != null && t.termText().equalsIgnoreCase("The")) {
101: super .add(t);
102: }
103: }
104: };
105: SinkTokenizer dogDetector = new SinkTokenizer(null) {
106: public void add(Token t) {
107: if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
108: super .add(t);
109: }
110: }
111: };
112: TokenStream source1 = new CachingTokenFilter(
113: new TeeTokenFilter(new TeeTokenFilter(
114: new WhitespaceTokenizer(new StringReader(
115: buffer1.toString())), theDetector),
116: dogDetector));
117: TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(
118: new WhitespaceTokenizer(new StringReader(buffer2
119: .toString())), theDetector), dogDetector);
120: Token token = null;
121: int i = 0;
122: while ((token = source1.next()) != null) {
123: assertTrue(token.termText() + " is not equal to "
124: + tokens1[i],
125: token.termText().equals(tokens1[i]) == true);
126: i++;
127: }
128: assertTrue(i + " does not equal: " + tokens1.length,
129: i == tokens1.length);
130: assertTrue("theDetector Size: "
131: + theDetector.getTokens().size() + " is not: " + 2,
132: theDetector.getTokens().size() == 2);
133: assertTrue("dogDetector Size: "
134: + dogDetector.getTokens().size() + " is not: " + 1,
135: dogDetector.getTokens().size() == 1);
136: i = 0;
137: while ((token = source2.next()) != null) {
138: assertTrue(token.termText() + " is not equal to "
139: + tokens2[i],
140: token.termText().equals(tokens2[i]) == true);
141: i++;
142: }
143: assertTrue(i + " does not equal: " + tokens2.length,
144: i == tokens2.length);
145: assertTrue("theDetector Size: "
146: + theDetector.getTokens().size() + " is not: " + 4,
147: theDetector.getTokens().size() == 4);
148: assertTrue("dogDetector Size: "
149: + dogDetector.getTokens().size() + " is not: " + 2,
150: dogDetector.getTokens().size() == 2);
151: i = 0;
152: while ((token = theDetector.next()) != null) {
153: assertTrue(token.termText() + " is not equal to " + "The",
154: token.termText().equalsIgnoreCase("The") == true);
155: i++;
156: }
157: assertTrue(i + " does not equal: "
158: + theDetector.getTokens().size(), i == theDetector
159: .getTokens().size());
160: i = 0;
161: while ((token = dogDetector.next()) != null) {
162: assertTrue(token.termText() + " is not equal to " + "Dogs",
163: token.termText().equalsIgnoreCase("Dogs") == true);
164: i++;
165: }
166: assertTrue(i + " does not equal: "
167: + dogDetector.getTokens().size(), i == dogDetector
168: .getTokens().size());
169: source1.reset();
170: TokenStream lowerCasing = new LowerCaseFilter(source1);
171: i = 0;
172: while ((token = lowerCasing.next()) != null) {
173: assertTrue(token.termText() + " is not equal to "
174: + tokens1[i].toLowerCase(), token.termText()
175: .equals(tokens1[i].toLowerCase()) == true);
176: i++;
177: }
178: assertTrue(i + " does not equal: " + tokens1.length,
179: i == tokens1.length);
180: }
181:
182: /**
183: * Not an explicit test, just useful to print out some info on performance
184: *
185: * @throws Exception
186: */
187: public void testPerformance() throws Exception {
188: int[] tokCount = { 100, 500, 1000, 2000, 5000, 10000 };
189: int[] modCounts = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
190: for (int k = 0; k < tokCount.length; k++) {
191: StringBuffer buffer = new StringBuffer();
192: System.out.println("-----Tokens: " + tokCount[k] + "-----");
193: for (int i = 0; i < tokCount[k]; i++) {
194: buffer.append(English.intToEnglish(i).toUpperCase())
195: .append(' ');
196: }
197: //make sure we produce the same tokens
198: ModuloSinkTokenizer sink = new ModuloSinkTokenizer(
199: tokCount[k], 100);
200: Token next = new Token();
201: TokenStream result = new TeeTokenFilter(new StandardFilter(
202: new StandardTokenizer(new StringReader(buffer
203: .toString()))), sink);
204: while ((next = result.next(next)) != null) {
205: }
206: result = new ModuloTokenFilter(new StandardFilter(
207: new StandardTokenizer(new StringReader(buffer
208: .toString()))), 100);
209: next = new Token();
210: List tmp = new ArrayList();
211: while ((next = result.next(next)) != null) {
212: tmp.add(next.clone());
213: }
214: List sinkList = sink.getTokens();
215: assertTrue("tmp Size: " + tmp.size() + " is not: "
216: + sinkList.size(), tmp.size() == sinkList.size());
217: for (int i = 0; i < tmp.size(); i++) {
218: Token tfTok = (Token) tmp.get(i);
219: Token sinkTok = (Token) sinkList.get(i);
220: assertTrue(tfTok.termText() + " is not equal to "
221: + sinkTok.termText() + " at token: " + i, tfTok
222: .termText().equals(sinkTok.termText()) == true);
223: }
224: //simulate two fields, each being analyzed once, for 20 documents
225:
226: for (int j = 0; j < modCounts.length; j++) {
227: int tfPos = 0;
228: long start = System.currentTimeMillis();
229: for (int i = 0; i < 20; i++) {
230: next = new Token();
231: result = new StandardFilter(new StandardTokenizer(
232: new StringReader(buffer.toString())));
233: while ((next = result.next(next)) != null) {
234: tfPos += next.getPositionIncrement();
235: }
236: next = new Token();
237: result = new ModuloTokenFilter(new StandardFilter(
238: new StandardTokenizer(new StringReader(
239: buffer.toString()))), modCounts[j]);
240: while ((next = result.next(next)) != null) {
241: tfPos += next.getPositionIncrement();
242: }
243: }
244: long finish = System.currentTimeMillis();
245: System.out.println("ModCount: " + modCounts[j]
246: + " Two fields took " + (finish - start)
247: + " ms");
248: int sinkPos = 0;
249: //simulate one field with one sink
250: start = System.currentTimeMillis();
251: for (int i = 0; i < 20; i++) {
252: sink = new ModuloSinkTokenizer(tokCount[k],
253: modCounts[j]);
254: next = new Token();
255: result = new TeeTokenFilter(new StandardFilter(
256: new StandardTokenizer(new StringReader(
257: buffer.toString()))), sink);
258: while ((next = result.next(next)) != null) {
259: sinkPos += next.getPositionIncrement();
260: }
261: //System.out.println("Modulo--------");
262: result = sink;
263: while ((next = result.next(next)) != null) {
264: sinkPos += next.getPositionIncrement();
265: }
266: }
267: finish = System.currentTimeMillis();
268: System.out.println("ModCount: " + modCounts[j]
269: + " Tee fields took " + (finish - start)
270: + " ms");
271: assertTrue(sinkPos + " does not equal: " + tfPos,
272: sinkPos == tfPos);
273:
274: }
275: System.out
276: .println("- End Tokens: " + tokCount[k] + "-----");
277: }
278:
279: }
280:
281: class ModuloTokenFilter extends TokenFilter {
282:
283: int modCount;
284:
285: ModuloTokenFilter(TokenStream input, int mc) {
286: super (input);
287: modCount = mc;
288: }
289:
290: int count = 0;
291:
292: //return every 100 tokens
293: public Token next(Token result) throws IOException {
294:
295: while ((result = input.next(result)) != null
296: && count % modCount != 0) {
297: count++;
298: }
299: count++;
300: return result;
301: }
302: }
303:
304: class ModuloSinkTokenizer extends SinkTokenizer {
305: int count = 0;
306: int modCount;
307:
308: ModuloSinkTokenizer(int numToks, int mc) {
309: modCount = mc;
310: lst = new ArrayList(numToks % mc);
311: }
312:
313: public void add(Token t) {
314: if (t != null && count % modCount == 0) {
315: lst.add(t.clone());
316: }
317: count++;
318: }
319: }
320: }
|