001: package org.apache.lucene.analysis.cjk;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2004 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: *
056: * $Id: CJKAnalyzer.java,v 1.1 2005/06/02 01:35:59 jfendler Exp $
057: */
058:
059: import org.apache.lucene.analysis.Analyzer;
060: import org.apache.lucene.analysis.StopFilter;
061: import org.apache.lucene.analysis.TokenStream;
062:
063: import java.io.Reader;
064: import java.util.Set;
065:
066: /**
067: * Filters CJKTokenizer with StopFilter.
068: *
069: * @author Che, Dong
070: */
071: public class CJKAnalyzer extends Analyzer {
072: //~ Static fields/initializers ---------------------------------------------
073:
074: /**
075: * An array containing some common English words that are not usually
076: * useful for searching. and some double-byte interpunctions.....
077: */
078: public final static String[] STOP_WORDS = { "a", "and", "are",
079: "as", "at", "be", "but", "by", "for", "if", "in", "into",
080: "is", "it", "no", "not", "of", "on", "or", "s", "such",
081: "t", "that", "the", "their", "then", "there", "these",
082: "they", "this", "to", "was", "will", "with", "", "www" };
083:
084: //~ Instance fields --------------------------------------------------------
085:
086: /**
087: * stop word list
088: */
089: private Set stopTable;
090:
091: //~ Constructors -----------------------------------------------------------
092:
093: /**
094: * Builds an analyzer which removes words in {@link #STOP_WORDS}.
095: */
096: public CJKAnalyzer() {
097: stopTable = StopFilter.makeStopSet(STOP_WORDS);
098: }
099:
100: /**
101: * Builds an analyzer which removes words in the provided array.
102: *
103: * @param stopWords stop word array
104: */
105: public CJKAnalyzer(String[] stopWords) {
106: stopTable = StopFilter.makeStopSet(stopWords);
107: }
108:
109: //~ Methods ----------------------------------------------------------------
110:
111: /**
112: * get token stream from input
113: *
114: * @param fieldName lucene field name
115: * @param reader input reader
116: * @return TokenStream
117: */
118: public final TokenStream tokenStream(String fieldName, Reader reader) {
119: return new StopFilter(new CJKTokenizer(reader), stopTable);
120: }
121: }
|