01: package org.apache.lucene.analysis.standard;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import org.apache.lucene.analysis.TokenFilter;
21: import org.apache.lucene.analysis.Token;
22: import org.apache.lucene.analysis.TokenStream;
23:
24: /** Normalizes tokens extracted with {@link StandardTokenizer}. */
25:
26: public final class StandardFilter extends TokenFilter {
27:
28: /** Construct filtering <i>in</i>. */
29: public StandardFilter(TokenStream in) {
30: super (in);
31: }
32:
33: private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
34: private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
35:
36: /** Returns the next token in the stream, or null at EOS.
37: * <p>Removes <tt>'s</tt> from the end of words.
38: * <p>Removes dots from acronyms.
39: */
40: public final Token next(Token result) throws java.io.IOException {
41: Token t = input.next(result);
42:
43: if (t == null)
44: return null;
45:
46: char[] buffer = t.termBuffer();
47: final int bufferLength = t.termLength();
48: final String type = t.type();
49:
50: if (type == APOSTROPHE_TYPE
51: && // remove 's
52: bufferLength >= 2
53: && buffer[bufferLength - 2] == '\''
54: && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
55: // Strip last 2 characters off
56: t.setTermLength(bufferLength - 2);
57: } else if (type == ACRONYM_TYPE) { // remove dots
58: int upto = 0;
59: for (int i = 0; i < bufferLength; i++) {
60: char c = buffer[i];
61: if (c != '.')
62: buffer[upto++] = c;
63: }
64: t.setTermLength(upto);
65: }
66:
67: return t;
68: }
69: }
|