01: package org.apache.lucene.analysis;
02:
03: import java.io.IOException;
04:
05: /**
06: * Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens
07: * that have already been analyzed. This is useful in situations where multiple fields share
08: * many common analysis steps and then go their separate ways.
09: * <p/>
10: * It is also useful for doing things like entity extraction or proper noun analysis as
11: * part of the analysis workflow and saving off those tokens for use in another field.
12: *
13: * <pre>
14: SinkTokenizer sink1 = new SinkTokenizer(null);
15: SinkTokenizer sink2 = new SinkTokenizer(null);
16:
17: TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
18: TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
19:
20: TokenStream final1 = new LowerCaseFilter(source1);
21: TokenStream final2 = source2;
22: TokenStream final3 = new EntityDetect(sink1);
23: TokenStream final4 = new URLDetect(sink2);
24:
25: d.add(new Field("f1", final1));
26: d.add(new Field("f2", final2));
27: d.add(new Field("f3", final3));
28: d.add(new Field("f4", final4));
29: * </pre>
* In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after the whitespace tokenizer,
* and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
* Note: the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
* <p/>
34: *
35: * See http://issues.apache.org/jira/browse/LUCENE-1058
36: * @see SinkTokenizer
37: *
38: **/
39: public class TeeTokenFilter extends TokenFilter {
40: SinkTokenizer sink;
41:
42: public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
43: super (input);
44: this .sink = sink;
45: }
46:
47: public Token next(Token result) throws IOException {
48: Token t = input.next(result);
49: sink.add(t);
50: return t;
51: }
52:
53: }
|