001: package org.apache.lucene.analysis.cjk;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2004 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import org.apache.lucene.analysis.Token;
058: import org.apache.lucene.analysis.Tokenizer;
059:
060: import java.io.Reader;
061:
062: /**
063: * CJKTokenizer was modified from StopTokenizer which does a decent job for
064: * most European languages. It performs other token methods for double-byte
065: * Characters: the token will return at each two charactors with overlap match.<br>
066: * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
067: * also need filter filter zero length token ""<br>
068: * for Digit: digit, '+', '#' will token as letter<br>
069: * for more info on Asia language(Chinese Japanese Korean) text segmentation:
070: * please search <a
071: * href="http://www.google.com/search?q=word+chinese+segment">google</a>
072: *
073: * @author Che, Dong
074: */
075: public final class CJKTokenizer extends Tokenizer {
076: //~ Static fields/initializers ---------------------------------------------
077:
078: /** Max word length */
079: private static final int MAX_WORD_LEN = 255;
080:
081: /** buffer size: */
082: private static final int IO_BUFFER_SIZE = 256;
083:
084: //~ Instance fields --------------------------------------------------------
085:
086: /** word offset, used to imply which character(in ) is parsed */
087: private int offset = 0;
088:
089: /** the index used only for ioBuffer */
090: private int bufferIndex = 0;
091:
092: /** data length */
093: private int dataLen = 0;
094:
095: /**
096: * character buffer, store the characters which are used to compose <br>
097: * the returned Token
098: */
099: private final char[] buffer = new char[MAX_WORD_LEN];
100:
101: /**
102: * I/O buffer, used to store the content of the input(one of the <br>
103: * members of Tokenizer)
104: */
105: private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
106:
107: /** word type: single=>ASCII double=>non-ASCII word=>default */
108: private String tokenType = "word";
109:
110: /**
111: * tag: previous character is a cached double-byte character "C1C2C3C4"
112: * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
113: * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
114: */
115: private boolean preIsTokened = false;
116:
117: //~ Constructors -----------------------------------------------------------
118:
119: /**
120: * Construct a token stream processing the given input.
121: *
122: * @param in I/O reader
123: */
124: public CJKTokenizer(Reader in) {
125: input = in;
126: }
127:
128: //~ Methods ----------------------------------------------------------------
129:
130: /**
131: * Returns the next token in the stream, or null at EOS.
132: * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
133: * for detail.
134: *
135: * @return Token
136: *
137: * @throws java.io.IOException - throw IOException when read error <br>
138: * hanppened in the InputStream
139: *
140: */
141: public final Token next() throws java.io.IOException {
142: /** how many character(s) has been stored in buffer */
143: int length = 0;
144:
145: /** the position used to create Token */
146: int start = offset;
147:
148: while (true) {
149: /** current charactor */
150: char c;
151:
152: /** unicode block of current charactor for detail */
153: Character.UnicodeBlock ub;
154:
155: offset++;
156:
157: if (bufferIndex >= dataLen) {
158: dataLen = input.read(ioBuffer);
159: bufferIndex = 0;
160: }
161:
162: if (dataLen == -1) {
163: if (length > 0) {
164: if (preIsTokened == true) {
165: length = 0;
166: preIsTokened = false;
167: }
168:
169: break;
170: } else {
171: return null;
172: }
173: } else {
174: //get current character
175: c = ioBuffer[bufferIndex++];
176:
177: //get the UnicodeBlock of the current character
178: ub = Character.UnicodeBlock.of(c);
179: }
180:
181: //if the current character is ASCII or Extend ASCII
182: if ((ub == Character.UnicodeBlock.BASIC_LATIN)
183: || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
184: if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
185: /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
186: int i = (int) c;
187: i = i - 65248;
188: c = (char) i;
189: }
190:
191: // if the current character is a letter or "_" "+" "#"
192: if (Character.isLetterOrDigit(c)
193: || ((c == '_') || (c == '+') || (c == '#'))) {
194: if (length == 0) {
195: // "javaC1C2C3C4linux" <br>
196: // ^--: the current character begin to token the ASCII
197: // letter
198: start = offset - 1;
199: } else if (tokenType == "double") {
200: // "javaC1C2C3C4linux" <br>
201: // ^--: the previous non-ASCII
202: // : the current character
203: offset--;
204: bufferIndex--;
205: tokenType = "single";
206:
207: if (preIsTokened == true) {
208: // there is only one non-ASCII has been stored
209: length = 0;
210: preIsTokened = false;
211:
212: break;
213: } else {
214: break;
215: }
216: }
217:
218: // store the LowerCase(c) in the buffer
219: buffer[length++] = Character.toLowerCase(c);
220: tokenType = "single";
221:
222: // break the procedure if buffer overflowed!
223: if (length == MAX_WORD_LEN) {
224: break;
225: }
226: } else if (length > 0) {
227: if (preIsTokened == true) {
228: length = 0;
229: preIsTokened = false;
230: } else {
231: break;
232: }
233: }
234: } else {
235: // non-ASCII letter, eg."C1C2C3C4"
236: if (Character.isLetter(c)) {
237: if (length == 0) {
238: start = offset - 1;
239: buffer[length++] = c;
240: tokenType = "double";
241: } else {
242: if (tokenType == "single") {
243: offset--;
244: bufferIndex--;
245:
246: //return the previous ASCII characters
247: break;
248: } else {
249: buffer[length++] = c;
250: tokenType = "double";
251:
252: if (length == 2) {
253: offset--;
254: bufferIndex--;
255: preIsTokened = true;
256:
257: break;
258: }
259: }
260: }
261: } else if (length > 0) {
262: if (preIsTokened == true) {
263: // empty the buffer
264: length = 0;
265: preIsTokened = false;
266: } else {
267: break;
268: }
269: }
270: }
271: }
272:
273: return new Token(new String(buffer, 0, length), start, start
274: + length, tokenType);
275: }
276: }
|