001: package org.apache.lucene.analysis.cn;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2004 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import java.io.Reader;
058: import org.apache.lucene.analysis.*;
059:
060: /**
061: * Title: ChineseTokenizer
062: * Description: Extract tokens from the Stream using Character.getType()
063: * Rule: A Chinese character as a single token
064: * Copyright: Copyright (c) 2001
065: * Company:
066: *
067: * The difference between the ChineseTokenizer and the
068: * CJKTokenizer (id=23545) is that they have different
069: * token parsing logic.
070: *
071: * Let me use an example. If having a Chinese text
072: * "C1C2C3C4" to be indexed, the tokens returned from the
073: * ChineseTokenizer are C1, C2, C3, C4. And the tokens
074: * returned from the CJKTokenizer are C1C2, C2C3, C3C4.
075: *
076: * Therefore the index the CJKTokenizer created is much
077: * larger.
078: *
079: * The problem is that when searching for C1, C1C2, C1C3,
080: * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
081: * CJKTokenizer will not work.
082: *
083: * @author Yiyi Sun
084: * @version 1.0
085: *
086: */
087:
088: public final class ChineseTokenizer extends Tokenizer {
089:
090: public ChineseTokenizer(Reader in) {
091: input = in;
092: }
093:
094: private int offset = 0, bufferIndex = 0, dataLen = 0;
095: private final static int MAX_WORD_LEN = 255;
096: private final static int IO_BUFFER_SIZE = 1024;
097: private final char[] buffer = new char[MAX_WORD_LEN];
098: private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
099:
100: private int length;
101: private int start;
102:
103: private final void push(char c) {
104:
105: if (length == 0)
106: start = offset - 1; // start of token
107: buffer[length++] = Character.toLowerCase(c); // buffer it
108:
109: }
110:
111: private final Token flush() {
112:
113: if (length > 0) {
114: //System.out.println(new String(buffer, 0, length));
115: return new Token(new String(buffer, 0, length), start,
116: start + length);
117: } else
118: return null;
119: }
120:
121: public final Token next() throws java.io.IOException {
122:
123: length = 0;
124: start = offset;
125:
126: while (true) {
127:
128: final char c;
129: offset++;
130:
131: if (bufferIndex >= dataLen) {
132: dataLen = input.read(ioBuffer);
133: bufferIndex = 0;
134: }
135: ;
136:
137: if (dataLen == -1)
138: return flush();
139: else
140: c = ioBuffer[bufferIndex++];
141:
142: switch (Character.getType(c)) {
143:
144: case Character.DECIMAL_DIGIT_NUMBER:
145: case Character.LOWERCASE_LETTER:
146: case Character.UPPERCASE_LETTER:
147: push(c);
148: if (length == MAX_WORD_LEN)
149: return flush();
150: break;
151:
152: case Character.OTHER_LETTER:
153: if (length > 0) {
154: bufferIndex--;
155: return flush();
156: }
157: push(c);
158: return flush();
159:
160: default:
161: if (length > 0)
162: return flush();
163: break;
164: }
165: }
166:
167: }
168: }
|