001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.io.IOException;
020: import java.util.HashSet;
021: import java.util.Iterator;
022:
023: import org.apache.lucene.index.IndexReader;
024: import org.apache.lucene.index.Term;
025: import org.apache.lucene.search.Query;
026:
027: /**
028: * Utility class used to extract the terms used in a query, plus any weights.
029: * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
030: * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
031: * expanded terms.
032: *
033: */
034: public final class QueryTermExtractor {
035:
036: /**
037: * Extracts all terms texts of a given Query into an array of WeightedTerms
038: *
039: * @param query Query to extract term texts from
040: * @return an array of the terms used in a query, plus their weights.
041: */
042: public static final WeightedTerm[] getTerms(Query query) {
043: return getTerms(query, false);
044: }
045:
046: /**
047: * Extracts all terms texts of a given Query into an array of WeightedTerms
048: *
049: * @param query Query to extract term texts from
050: * @param reader used to compute IDF which can be used to a) score selected fragments better
051: * b) use graded highlights eg chaning intensity of font color
052: * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
053: * @return an array of the terms used in a query, plus their weights.
054: */
055: public static final WeightedTerm[] getIdfWeightedTerms(Query query,
056: IndexReader reader, String fieldName) {
057: WeightedTerm[] terms = getTerms(query, false, fieldName);
058: int totalNumDocs = reader.numDocs();
059: for (int i = 0; i < terms.length; i++) {
060: try {
061: int docFreq = reader.docFreq(new Term(fieldName,
062: terms[i].term));
063: //IDF algorithm taken from DefaultSimilarity class
064: float idf = (float) (Math.log((float) totalNumDocs
065: / (double) (docFreq + 1)) + 1.0);
066: terms[i].weight *= idf;
067: } catch (IOException e) {
068: //ignore
069: }
070: }
071: return terms;
072: }
073:
074: /**
075: * Extracts all terms texts of a given Query into an array of WeightedTerms
076: *
077: * @param query Query to extract term texts from
078: * @param prohibited <code>true</code> to extract "prohibited" terms, too
079: * @param fieldName The fieldName used to filter query terms
080: * @return an array of the terms used in a query, plus their weights.
081: */
082: public static final WeightedTerm[] getTerms(Query query,
083: boolean prohibited, String fieldName) {
084: HashSet terms = new HashSet();
085: if (fieldName != null) {
086: fieldName = fieldName.intern();
087: }
088: getTerms(query, terms, prohibited, fieldName);
089: return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
090: }
091:
092: /**
093: * Extracts all terms texts of a given Query into an array of WeightedTerms
094: *
095: * @param query Query to extract term texts from
096: * @param prohibited <code>true</code> to extract "prohibited" terms, too
097: * @return an array of the terms used in a query, plus their weights.
098: */
099: public static final WeightedTerm[] getTerms(Query query,
100: boolean prohibited) {
101: return getTerms(query, prohibited, null);
102: }
103:
104: //fieldname MUST be interned prior to this call
105: private static final void getTerms(Query query, HashSet terms,
106: boolean prohibited, String fieldName) {
107: try {
108: HashSet nonWeightedTerms = new HashSet();
109: query.extractTerms(nonWeightedTerms);
110: for (Iterator iter = nonWeightedTerms.iterator(); iter
111: .hasNext();) {
112: Term term = (Term) iter.next();
113: if ((fieldName == null) || (term.field() == fieldName)) {
114: terms.add(new WeightedTerm(query.getBoost(), term
115: .text()));
116: }
117: }
118: } catch (UnsupportedOperationException ignore) {
119: //this is non-fatal for our purposes
120: }
121: }
122: }
|