001: package org.apache.lucene.analysis.ngram;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Token;
021: import org.apache.lucene.analysis.TokenFilter;
022: import org.apache.lucene.analysis.TokenStream;
023:
024: import java.io.IOException;
025: import java.util.LinkedList;
026:
027: /**
028: * Tokenizes the input into n-grams of the given size(s).
029: * @author Otis Gospodnetic
030: */
031: public class NGramTokenFilter extends TokenFilter {
032: public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
033: public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
034:
035: private int minGram, maxGram;
036: private LinkedList ngrams;
037:
038: /**
039: * Creates NGramTokenFilter with given min and max n-grams.
040: * @param input TokenStream holding the input to be tokenized
041: * @param minGram the smallest n-gram to generate
042: * @param maxGram the largest n-gram to generate
043: */
044: public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
045: super (input);
046: if (minGram < 1) {
047: throw new IllegalArgumentException(
048: "minGram must be greater than zero");
049: }
050: if (minGram > maxGram) {
051: throw new IllegalArgumentException(
052: "minGram must not be greater than maxGram");
053: }
054: this .minGram = minGram;
055: this .maxGram = maxGram;
056: this .ngrams = new LinkedList();
057: }
058:
059: /**
060: * Creates NGramTokenFilter with default min and max n-grams.
061: * @param input TokenStream holding the input to be tokenized
062: */
063: public NGramTokenFilter(TokenStream input) {
064: this (input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
065: }
066:
067: /** Returns the next token in the stream, or null at EOS. */
068: public final Token next() throws IOException {
069: if (ngrams.size() > 0) {
070: return (Token) ngrams.removeFirst();
071: }
072:
073: Token token = input.next();
074: if (token == null) {
075: return null;
076: }
077:
078: ngram(token);
079: if (ngrams.size() > 0)
080: return (Token) ngrams.removeFirst();
081: else
082: return null;
083: }
084:
085: private void ngram(Token token) {
086: String inStr = token.termText();
087: int inLen = inStr.length();
088: int gramSize = minGram;
089: while (gramSize <= maxGram) {
090: int pos = 0; // reset to beginning of string
091: while (pos + gramSize <= inLen) { // while there is input
092: String gram = inStr.substring(pos, pos + gramSize);
093: Token tok = new Token(gram, pos, pos + gramSize);
094: // tok.setPositionIncrement(pos);
095: ngrams.add(tok);
096: pos++;
097: }
098: gramSize++; // increase n-gram size
099: }
100: }
101: }
|