001: package org.apache.lucene.analysis.ngram;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Token;
021: import org.apache.lucene.analysis.Tokenizer;
022:
023: import java.io.IOException;
024: import java.io.Reader;
025:
026: /**
027: * Tokenizes the input from an edge into n-grams of given size(s).
028: * @author Otis Gospodnetic
029: * @author Adam Hiatt
030: */
031: public class EdgeNGramTokenizer extends Tokenizer {
032: public static final Side DEFAULT_SIDE = Side.FRONT;
033: public static final int DEFAULT_MAX_GRAM_SIZE = 1;
034: public static final int DEFAULT_MIN_GRAM_SIZE = 1;
035:
036: // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
037: /** Specifies which side of the input the n-gram should be generated from */
038: public static class Side {
039: private String label;
040:
041: /** Get the n-gram from the front of the input */
042: public static Side FRONT = new Side("front");
043:
044: /** Get the n-gram from the end of the input */
045: public static Side BACK = new Side("back");
046:
047: // Private ctor
048: private Side(String label) {
049: this .label = label;
050: }
051:
052: public String getLabel() {
053: return label;
054: }
055:
056: // Get the appropriate Side from a string
057: public static Side getSide(String sideName) {
058: if (FRONT.getLabel().equals(sideName)) {
059: return FRONT;
060: } else if (BACK.getLabel().equals(sideName)) {
061: return BACK;
062: }
063: return null;
064: }
065: }
066:
067: private int minGram;
068: private int maxGram;
069: private int gramSize;
070: private Side side;
071: private boolean started = false;
072: private int inLen;
073: private String inStr;
074:
075: /**
076: * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
077: *
078: * @param input Reader holding the input to be tokenized
079: * @param side the {@link Side} from which to chop off an n-gram
080: * @param minGram the smallest n-gram to generate
081: * @param maxGram the largest n-gram to generate
082: */
083: public EdgeNGramTokenizer(Reader input, Side side, int minGram,
084: int maxGram) {
085: super (input);
086:
087: if (side == null) {
088: throw new IllegalArgumentException(
089: "sideLabel must be either front or back");
090: }
091:
092: if (minGram < 1) {
093: throw new IllegalArgumentException(
094: "minGram must be greater than zero");
095: }
096:
097: if (minGram > maxGram) {
098: throw new IllegalArgumentException(
099: "minGram must not be greater than maxGram");
100: }
101:
102: this .minGram = minGram;
103: this .maxGram = maxGram;
104: this .side = side;
105: }
106:
107: /**
108: * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
109: *
110: * @param input Reader holding the input to be tokenized
111: * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
112: * @param minGram the smallest n-gram to generate
113: * @param maxGram the largest n-gram to generate
114: */
115: public EdgeNGramTokenizer(Reader input, String sideLabel,
116: int minGram, int maxGram) {
117: this (input, Side.getSide(sideLabel), minGram, maxGram);
118: }
119:
120: /** Returns the next token in the stream, or null at EOS. */
121: public final Token next() throws IOException {
122: // if we are just starting, read the whole input
123: if (!started) {
124: started = true;
125: char[] chars = new char[1024];
126: input.read(chars);
127: inStr = new String(chars).trim(); // remove any trailing empty strings
128: inLen = inStr.length();
129: gramSize = minGram;
130: }
131:
132: // if the remaining input is too short, we can't generate any n-grams
133: if (gramSize > inLen) {
134: return null;
135: }
136:
137: // if we have hit the end of our n-gram size range, quit
138: if (gramSize > maxGram) {
139: return null;
140: }
141:
142: Token tok;
143: if (side == Side.FRONT) {
144: tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
145: } else {
146: tok = new Token(inStr.substring(inLen - gramSize), inLen
147: - gramSize, inLen);
148: }
149:
150: gramSize++;
151: return tok;
152: }
153: }
|