001: /*
002: * Copyright 2004-2006 the original author or authors.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.apache.lucene.search.spell;
018:
019: import java.io.IOException;
020: import java.util.Iterator;
021:
022: import org.apache.lucene.index.IndexReader;
023: import org.apache.lucene.index.Term;
024: import org.apache.lucene.index.TermEnum;
025:
026: /**
027: * HighFrequencyDictionary: terms taken from the given field
028: * of a Lucene index, which appear in a number of documents
029: * above a given threshold.
030: *
031: * When using IndexReader.terms(Term) the code must not call next() on TermEnum
032: * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
033: *
034: * Threshold is a value in [0..1] representing the minimum
035: * number of documents (of the total) where a term should appear.
036: *
037: * Based on LuceneDictionary.
038: */
039: public class HighFrequencyDictionary implements Dictionary {
040: private IndexReader reader;
041: private String field;
042: private float thresh;
043:
044: public HighFrequencyDictionary(IndexReader reader, String field,
045: float thresh) {
046: this .reader = reader;
047: this .field = field.intern();
048: this .thresh = thresh;
049: }
050:
051: public final Iterator getWordsIterator() {
052: return new HighFrequencyIterator();
053: }
054:
055: final class HighFrequencyIterator implements Iterator {
056: private TermEnum termEnum;
057: private Term actualTerm;
058: private boolean hasNextCalled;
059: private int minNumDocs;
060:
061: HighFrequencyIterator() {
062: try {
063: termEnum = reader.terms(new Term(field, ""));
064: minNumDocs = (int) (thresh * (float) reader.numDocs());
065: } catch (IOException e) {
066: throw new RuntimeException(e);
067: }
068: }
069:
070: private boolean isFrequent(Term term) {
071: try {
072: return reader.docFreq(term) >= minNumDocs;
073: } catch (IOException e) {
074: throw new RuntimeException(e);
075: }
076: }
077:
078: public Object next() {
079: if (!hasNextCalled) {
080: hasNext();
081: }
082: hasNextCalled = false;
083:
084: try {
085: termEnum.next();
086: } catch (IOException e) {
087: throw new RuntimeException(e);
088: }
089:
090: return (actualTerm != null) ? actualTerm.text() : null;
091: }
092:
093: public boolean hasNext() {
094: if (hasNextCalled) {
095: return actualTerm != null;
096: }
097: hasNextCalled = true;
098:
099: do {
100: actualTerm = termEnum.term();
101:
102: // if there are no words return false
103: if (actualTerm == null) {
104: return false;
105: }
106:
107: String currentField = actualTerm.field();
108:
109: // if the next word doesn't have the same field return false
110: if (currentField != field) {
111: actualTerm = null;
112: return false;
113: }
114:
115: // got a valid term, does it pass the threshold?
116: if (isFrequent(actualTerm)) {
117: return true;
118: }
119:
120: // term not up to threshold
121: try {
122: termEnum.next();
123: } catch (IOException e) {
124: throw new RuntimeException(e);
125: }
126:
127: } while (true);
128: }
129:
130: public void remove() {
131: throw new UnsupportedOperationException();
132: }
133: }
134: }
|