01: /*
02: * Original from: lucene source code package
03: * Changes as described in http://www.geocrawler.com/archives/3/2624/2000/11/0/4746798/
04: */
05:
06: package vqwiki.utils.lucene;
07:
08: /* ====================================================================
09: * The Apache Software License, Version 1.1
10: *
11: * Copyright (c) 2001 The Apache Software Foundation. All rights
12: * reserved.
13: *
14: * Redistribution and use in source and binary forms, with or without
15: * modification, are permitted provided that the following conditions
16: * are met:
17: *
18: * 1. Redistributions of source code must retain the above copyright
19: * notice, this list of conditions and the following disclaimer.
20: *
21: * 2. Redistributions in binary form must reproduce the above copyright
22: * notice, this list of conditions and the following disclaimer in
23: * the documentation and/or other materials provided with the
24: * distribution.
25: *
26: * 3. The end-user documentation included with the redistribution,
27: * if any, must include the following acknowledgment:
28: * "This product includes software developed by the
29: * Apache Software Foundation (http://www.apache.org/)."
30: * Alternately, this acknowledgment may appear in the software itself,
31: * if and wherever such third-party acknowledgments normally appear.
32: *
33: * 4. The names "Apache" and "Apache Software Foundation" and
34: * "Apache Lucene" must not be used to endorse or promote products
35: * derived from this software without prior written permission. For
36: * written permission, please contact apache@apache.org.
37: *
38: * 5. Products derived from this software may not be called "Apache",
39: * "Apache Lucene", nor may "Apache" appear in their name, without
40: * prior written permission of the Apache Software Foundation.
41: *
42: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
43: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
44: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
46: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
49: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
51: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
52: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53: * SUCH DAMAGE.
54: * ====================================================================
55: *
56: * This software consists of voluntary contributions made by many
57: * individuals on behalf of the Apache Software Foundation. For more
58: * information on the Apache Software Foundation, please see
59: * <http://www.apache.org/>.
60: */
61:
62: import java.io.Reader;
63:
64: import org.apache.lucene.analysis.LetterTokenizer;
65:
66: /**
67: * LowerCaseKeepNumbersTokenizer performs the function of LetterTokenizer
68: * and LowerCaseFilter together. It divides text at non-letters and converts
69: * them to lower case. While it is functionally equivalent to the combination
70: * of LetterTokenizer and LowerCaseFilter, there is a performance advantage
71: * to doing the two tasks at once, hence this (redundant) implementation.
72: * <P>
73: * Note: this does a decent job for most European languages, but does a terrible
74: * job for some Asian languages, where words are not separated by spaces.
75: */
76: public final class LowerCaseKeepNumbersTokenizer extends
77: LetterTokenizer {
78: /** Construct a new LowerCaseKeepNumbersTokenizer. */
79: public LowerCaseKeepNumbersTokenizer(Reader in) {
80: super (in);
81: }
82:
83: /** Collects only characters which satisfy
84: * {@link Character#isLetter(char)}.*/
85: protected char normalize(char c) {
86: return Character.toLowerCase(c);
87: }
88:
89: /** Collects only characters which satisfy
90: * {@link Character#isLetter(char)}.*/
91: protected boolean isTokenChar(char c) {
92: return Character.isLetterOrDigit(c);
93: }
94: }
|