001: package org.apache.lucene.analysis.cjk;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Token;
021: import org.apache.lucene.analysis.Tokenizer;
022:
023: import java.io.Reader;
024:
/**
 * CJKTokenizer was modified from StopTokenizer, which does a decent job for
 * most European languages. It uses a different tokenization method for double-byte
 * characters: a token is returned for every two characters, with overlapping matches.<br>
 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". It
 * may also produce zero-length tokens (""), which need to be filtered out.<br>
 * Digits, '_', '+' and '#' are tokenized as letters.<br>
 * For more information on Asian language (Chinese, Japanese, Korean) text segmentation,
 * please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 *
 * @author Che, Dong
 */
038: public final class CJKTokenizer extends Tokenizer {
039: //~ Static fields/initializers ---------------------------------------------
040:
041: /** Max word length */
042: private static final int MAX_WORD_LEN = 255;
043:
044: /** buffer size: */
045: private static final int IO_BUFFER_SIZE = 256;
046:
047: //~ Instance fields --------------------------------------------------------
048:
049: /** word offset, used to imply which character(in ) is parsed */
050: private int offset = 0;
051:
052: /** the index used only for ioBuffer */
053: private int bufferIndex = 0;
054:
055: /** data length */
056: private int dataLen = 0;
057:
058: /**
059: * character buffer, store the characters which are used to compose <br>
060: * the returned Token
061: */
062: private final char[] buffer = new char[MAX_WORD_LEN];
063:
064: /**
065: * I/O buffer, used to store the content of the input(one of the <br>
066: * members of Tokenizer)
067: */
068: private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
069:
070: /** word type: single=>ASCII double=>non-ASCII word=>default */
071: private String tokenType = "word";
072:
073: /**
074: * tag: previous character is a cached double-byte character "C1C2C3C4"
075: * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
076: * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
077: */
078: private boolean preIsTokened = false;
079:
080: //~ Constructors -----------------------------------------------------------
081:
082: /**
083: * Construct a token stream processing the given input.
084: *
085: * @param in I/O reader
086: */
087: public CJKTokenizer(Reader in) {
088: input = in;
089: }
090:
091: //~ Methods ----------------------------------------------------------------
092:
093: /**
094: * Returns the next token in the stream, or null at EOS.
095: * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
096: * for detail.
097: *
098: * @return Token
099: *
100: * @throws java.io.IOException - throw IOException when read error <br>
101: * hanppened in the InputStream
102: *
103: */
104: public final Token next() throws java.io.IOException {
105: /** how many character(s) has been stored in buffer */
106: int length = 0;
107:
108: /** the position used to create Token */
109: int start = offset;
110:
111: while (true) {
112: /** current charactor */
113: char c;
114:
115: /** unicode block of current charactor for detail */
116: Character.UnicodeBlock ub;
117:
118: offset++;
119:
120: if (bufferIndex >= dataLen) {
121: dataLen = input.read(ioBuffer);
122: bufferIndex = 0;
123: }
124:
125: if (dataLen == -1) {
126: if (length > 0) {
127: if (preIsTokened == true) {
128: length = 0;
129: preIsTokened = false;
130: }
131:
132: break;
133: } else {
134: return null;
135: }
136: } else {
137: //get current character
138: c = ioBuffer[bufferIndex++];
139:
140: //get the UnicodeBlock of the current character
141: ub = Character.UnicodeBlock.of(c);
142: }
143:
144: //if the current character is ASCII or Extend ASCII
145: if ((ub == Character.UnicodeBlock.BASIC_LATIN)
146: || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
147: if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
148: /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
149: int i = (int) c;
150: i = i - 65248;
151: c = (char) i;
152: }
153:
154: // if the current character is a letter or "_" "+" "#"
155: if (Character.isLetterOrDigit(c)
156: || ((c == '_') || (c == '+') || (c == '#'))) {
157: if (length == 0) {
158: // "javaC1C2C3C4linux" <br>
159: // ^--: the current character begin to token the ASCII
160: // letter
161: start = offset - 1;
162: } else if (tokenType == "double") {
163: // "javaC1C2C3C4linux" <br>
164: // ^--: the previous non-ASCII
165: // : the current character
166: offset--;
167: bufferIndex--;
168: tokenType = "single";
169:
170: if (preIsTokened == true) {
171: // there is only one non-ASCII has been stored
172: length = 0;
173: preIsTokened = false;
174:
175: break;
176: } else {
177: break;
178: }
179: }
180:
181: // store the LowerCase(c) in the buffer
182: buffer[length++] = Character.toLowerCase(c);
183: tokenType = "single";
184:
185: // break the procedure if buffer overflowed!
186: if (length == MAX_WORD_LEN) {
187: break;
188: }
189: } else if (length > 0) {
190: if (preIsTokened == true) {
191: length = 0;
192: preIsTokened = false;
193: } else {
194: break;
195: }
196: }
197: } else {
198: // non-ASCII letter, eg."C1C2C3C4"
199: if (Character.isLetter(c)) {
200: if (length == 0) {
201: start = offset - 1;
202: buffer[length++] = c;
203: tokenType = "double";
204: } else {
205: if (tokenType == "single") {
206: offset--;
207: bufferIndex--;
208:
209: //return the previous ASCII characters
210: break;
211: } else {
212: buffer[length++] = c;
213: tokenType = "double";
214:
215: if (length == 2) {
216: offset--;
217: bufferIndex--;
218: preIsTokened = true;
219:
220: break;
221: }
222: }
223: }
224: } else if (length > 0) {
225: if (preIsTokened == true) {
226: // empty the buffer
227: length = 0;
228: preIsTokened = false;
229: } else {
230: break;
231: }
232: }
233: }
234: }
235:
236: return new Token(new String(buffer, 0, length), start, start
237: + length, tokenType);
238: }
239: }
|