package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Title: ChineseTokenizer
 * Description: Extract tokens from the stream using Character.getType()
 * Rule: Each Chinese character is emitted as a single token
 * Copyright: Copyright (c) 2001
 *
 * The difference between the ChineseTokenizer and the
 * CJKTokenizer (id=23545) is that they use different
 * token parsing logic.
 *
 * For example, if the Chinese text "C1C2C3C4" is to be
 * indexed, the tokens returned from the ChineseTokenizer
 * are C1, C2, C3, C4, while the tokens returned from the
 * CJKTokenizer are the overlapping bigrams C1C2, C2C3,
 * C3C4. The index the CJKTokenizer creates is therefore
 * much larger.
 *
 * The trade-off is that queries such as C1, C1C2, C1C3,
 * C4C2, C1C2C3 ... match against an index built with the
 * ChineseTokenizer, but not against one built with the
 * CJKTokenizer.
 *
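 * <p>A minimal usage sketch (the sample text is illustrative only):
 * each CJK character comes back as its own token, while the run of
 * basic Latin letters and digits comes back as one lowercased token.
 * <pre>
 *   Tokenizer tok = new ChineseTokenizer(new java.io.StringReader("\u4E2D\u6587AB12"));
 *   for (Token t = tok.next(); t != null; t = tok.next())
 *     System.out.println(t.termText());   // prints: \u4E2D, \u6587, ab12
 * </pre>
 *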
 * @author Yiyi Sun
 * @version 1.0
 *
 */

public final class ChineseTokenizer extends Tokenizer {

    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0;        // current character offset in the stream
    private int bufferIndex = 0;   // read position within ioBuffer
    private int dataLen = 0;       // number of valid chars in ioBuffer, -1 at end of stream

    private static final int MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 1024;

    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;            // length of the token currently being built
    private int start;             // start offset of the token currently being built

    private final void push(char c) {

        if (length == 0)
            start = offset - 1;                          // start of token
        buffer[length++] = Character.toLowerCase(c);     // buffer it

    }

    private final Token flush() {

        if (length > 0) {
            return new Token(new String(buffer, 0, length), start,
                start + length);
        } else
            return null;
    }

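    /**
     * Returns the next token in the stream, or null at end of stream.
     * Characters are classified with Character.getType(): letters and
     * digits accumulate into a buffered token, each CJK character
     * (Character.OTHER_LETTER) is emitted as a token of its own, and
     * anything else terminates the token in progress.
     */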
    public final Token next() throws IOException {

        length = 0;
        start = offset;

        while (true) {

            final char c;
            offset++;

            // refill the I/O buffer once it has been fully consumed
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            // end of stream: emit whatever is buffered, or null
            if (dataLen == -1)
                return flush();
            else
                c = ioBuffer[bufferIndex++];

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                // basic letters and digits accumulate into a single token
                push(c);
                if (length == MAX_WORD_LEN)
                    return flush();
                break;

            case Character.OTHER_LETTER:
                // a CJK character: if a letter/digit token is pending, emit
                // it first and push this character back for the next call
                if (length > 0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                // otherwise the character is a token by itself
                push(c);
                return flush();

            default:
                // whitespace, punctuation, etc. terminate the current token
                if (length > 0)
                    return flush();
                break;
            }
        }
    }
}