01: package org.apache.lucene.analysis;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import java.io.Reader;
21: import java.io.IOException;
22:
23: /** An Analyzer builds TokenStreams, which analyze text. It thus represents a
24: * policy for extracting index terms from text.
25: * <p>
26: * Typical implementations first build a Tokenizer, which breaks the stream of
27: * characters from the Reader into raw Tokens. One or more TokenFilters may
28: * then be applied to the output of the Tokenizer.
29: * <p>
30: * WARNING: You must override one of the methods defined by this class in your
31: * subclass or the Analyzer will enter an infinite loop.
32: */
33: public abstract class Analyzer {
34: /** Creates a TokenStream which tokenizes all the text in the provided
35: Reader. Default implementation forwards to tokenStream(Reader) for
36: compatibility with older version. Override to allow Analyzer to choose
37: strategy based on document and/or field. Must be able to handle null
38: field name for backward compatibility. */
39: public abstract TokenStream tokenStream(String fieldName,
40: Reader reader);
41:
42: /** Creates a TokenStream that is allowed to be re-used
43: * from the previous time that the same thread called
44: * this method. Callers that do not need to use more
45: * than one TokenStream at the same time from this
46: * analyzer should use this method for better
47: * performance.
48: */
49: public TokenStream reusableTokenStream(String fieldName,
50: Reader reader) throws IOException {
51: return tokenStream(fieldName, reader);
52: }
53:
54: private ThreadLocal tokenStreams = new ThreadLocal();
55:
56: /** Used by Analyzers that implement reusableTokenStream
57: * to retrieve previously saved TokenStreams for re-use
58: * by the same thread. */
59: protected Object getPreviousTokenStream() {
60: return tokenStreams.get();
61: }
62:
63: /** Used by Analyzers that implement reusableTokenStream
64: * to save a TokenStream for later re-use by the same
65: * thread. */
66: protected void setPreviousTokenStream(Object obj) {
67: tokenStreams.set(obj);
68: }
69:
70: /**
71: * Invoked before indexing a Fieldable instance if
72: * terms have already been added to that field. This allows custom
73: * analyzers to place an automatic position increment gap between
74: * Fieldable instances using the same field name. The default value
75: * position increment gap is 0. With a 0 position increment gap and
76: * the typical default token position increment of 1, all terms in a field,
77: * including across Fieldable instances, are in successive positions, allowing
78: * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
79: *
80: * @param fieldName Fieldable name being indexed.
81: * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
82: */
83: public int getPositionIncrementGap(String fieldName) {
84: return 0;
85: }
86: }
|