package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

/** An abstract base class for simple, character-oriented tokenizers. A
 * subclass decides which characters belong to a token via
 * {@link #isTokenChar(char)} and may transform each accepted character via
 * {@link #normalize(char)}.
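 *
 * <p>A minimal sketch of a concrete subclass (the class name below is
 * hypothetical and not part of Lucene): it keeps letters and lower-cases
 * them while buffering, followed by a loop that drains the stream through
 * the reusable-Token {@link #next(Token)} API of this class.
 *
 * <pre>
 * // hypothetical example subclass
 * public class LowerCaseLetterTokenizer extends CharTokenizer {
 *   public LowerCaseLetterTokenizer(Reader in) { super(in); }
 *   protected boolean isTokenChar(char c) { return Character.isLetter(c); }
 *   protected char normalize(char c) { return Character.toLowerCase(c); }
 * }
 *
 * // usage: "Foo Bar" yields the tokens "foo" and "bar"
 * Tokenizer tok = new LowerCaseLetterTokenizer(new StringReader("Foo Bar"));
 * for (Token t = tok.next(new Token()); t != null; t = tok.next(t)) {
 *   String term = new String(t.termBuffer(), 0, t.termLength());
 * }
 * </pre>
 */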
public abstract class CharTokenizer extends Tokenizer {
  public CharTokenizer(Reader input) {
    super(input);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** Returns true iff a character should be included in a token. This
   * tokenizer emits, as tokens, maximal runs of adjacent characters that
   * satisfy this predicate. Characters for which it returns false mark token
   * boundaries and are not included in any token. */
  protected abstract boolean isTokenChar(char c);

  /** Called on each token character to normalize it before it is added to the
   * token. The default implementation does nothing. Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

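  /**
   * Returns the next token from the input, reusing the supplied Token
   * instance (its term buffer, length and offsets are overwritten), or
   * returns null once the underlying Reader is exhausted. Characters are
   * pulled through a private read buffer of IO_BUFFER_SIZE chars, and a
   * token is cut off early once it reaches MAX_WORD_LEN characters.
   */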
  public final Token next(Token token) throws IOException {
    token.clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = token.termBuffer();
    while (true) {

      if (bufferIndex >= dataLen) {       // refill the i/o buffer from the Reader
        offset += dataLen;
        dataLen = input.read(ioBuffer);
        if (dataLen == -1) {              // end of input
          dataLen = 0;                    // so the next offset += dataLen won't decrement offset
          if (length > 0)
            break;                        // flush the pending token
          else
            return null;
        }
        bufferIndex = 0;
      }

      final char c = ioBuffer[bufferIndex++];

      if (isTokenChar(c)) {               // if it's a token char

        if (length == 0)                  // start of token
          start = offset + bufferIndex - 1;
        else if (length == buffer.length)
          buffer = token.resizeTermBuffer(1 + length);

        buffer[length++] = normalize(c);  // buffer it, normalized

        if (length == MAX_WORD_LEN)       // buffer overflow!
          break;

      } else if (length > 0)              // at non-token char with buffered chars
        break;                            // return them
    }

    token.termLength = length;
    token.startOffset = start;
    token.endOffset = start + length;
    return token;
  }

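  /**
   * Makes this tokenizer read from a new Reader, clearing the internal
   * buffering state so that the next token's offsets start again at zero.
   */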
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
  }
}