001: package org.apache.lucene.index.memory;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021:
022: import org.apache.lucene.analysis.Token;
023: import org.apache.lucene.analysis.TokenFilter;
024: import org.apache.lucene.analysis.TokenStream;
025:
026: /**
027: * Injects additional tokens for synonyms of token terms fetched from the
028: * underlying child stream; the child stream must deliver lowercase tokens
029: * for synonyms to be found.
030: *
031: * @author whoschek.AT.lbl.DOT.gov
032: */
033: public class SynonymTokenFilter extends TokenFilter {
034:
035: /** The Token.type used to indicate a synonym to higher level filters. */
036: public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
037:
038: private final SynonymMap synonyms;
039: private final int maxSynonyms;
040:
041: private String[] stack = null;
042: private int index = 0;
043: private Token current = null;
044: private int todo = 0;
045:
046: /**
047: * Creates an instance for the given underlying stream and synonym table.
048: *
049: * @param input
050: * the underlying child token stream
051: * @param synonyms
052: * the map used to extract synonyms for terms
053: * @param maxSynonyms
054: * the maximum number of synonym tokens to return per underlying
055: * token word (a value of Integer.MAX_VALUE indicates unlimited)
056: */
057: public SynonymTokenFilter(TokenStream input, SynonymMap synonyms,
058: int maxSynonyms) {
059: super (input);
060: if (input == null)
061: throw new IllegalArgumentException("input must not be null");
062: if (synonyms == null)
063: throw new IllegalArgumentException(
064: "synonyms must not be null");
065: if (maxSynonyms < 0)
066: throw new IllegalArgumentException(
067: "maxSynonyms must not be negative");
068:
069: this .synonyms = synonyms;
070: this .maxSynonyms = maxSynonyms;
071: }
072:
073: /** Returns the next token in the stream, or null at EOS. */
074: public Token next() throws IOException {
075: Token token;
076: while (todo > 0 && index < stack.length) { // pop from stack
077: token = createToken(stack[index++], current);
078: if (token != null) {
079: todo--;
080: return token;
081: }
082: }
083:
084: token = input.next();
085: if (token == null)
086: return null; // EOS; iterator exhausted
087:
088: stack = synonyms.getSynonyms(token.termText()); // push onto stack
089: if (stack.length > maxSynonyms)
090: randomize(stack);
091: index = 0;
092: current = token;
093: todo = maxSynonyms;
094: return token;
095: }
096:
097: /**
098: * Creates and returns a token for the given synonym of the current input
099: * token; Override for custom (stateless or stateful) behaviour, if desired.
100: *
101: * @param synonym
102: * a synonym for the current token's term
103: * @param current
104: * the current token from the underlying child stream
105: * @return a new token, or null to indicate that the given synonym should be
106: * ignored
107: */
108: protected Token createToken(String synonym, Token current) {
109: Token token = new Token(synonym, current.startOffset(), current
110: .endOffset(), SYNONYM_TOKEN_TYPE);
111: token.setPositionIncrement(0);
112: return token;
113: }
114:
115: /**
116: * Randomize synonyms to later sample a subset. Uses constant random seed
117: * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
118: * number generator with medium statistical quality (multiplicative
119: * congruential method), producing integers in the range [Integer.MIN_VALUE,
120: * Integer.MAX_VALUE].
121: */
122: private static void randomize(Object[] arr) {
123: int seed = 1234567; // constant
124: int randomState = 4 * seed + 1;
125: // Random random = new Random(seed); // unnecessary overhead
126: int len = arr.length;
127: for (int i = 0; i < len - 1; i++) {
128: randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
129: int r = randomState % (len - i);
130: if (r < 0)
131: r = -r; // e.g. -9 % 2 == -1
132: // int r = random.nextInt(len-i);
133:
134: // swap arr[i, i+r]
135: Object tmp = arr[i];
136: arr[i] = arr[i + r];
137: arr[i + r] = tmp;
138: }
139: }
140:
141: }
|