001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.util.HashSet;
022: import java.util.Iterator;
023:
024: import org.apache.lucene.index.IndexReader;
025: import org.apache.lucene.index.Term;
026: import org.apache.lucene.search.BooleanClause;
027: import org.apache.lucene.search.BooleanQuery;
028: import org.apache.lucene.search.FilteredQuery;
029: import org.apache.lucene.search.Query;
030:
031: /**
032: * Utility class used to extract the terms used in a query, plus any weights.
033: * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
034: * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
035: * expanded terms.
036: *
037: */
038: public final class QueryTermExtractor {
039:
040: /**
041: * Extracts all terms texts of a given Query into an array of WeightedTerms
042: *
043: * @param query Query to extract term texts from
044: * @return an array of the terms used in a query, plus their weights.
045: */
046: public static final WeightedTerm[] getTerms(Query query) {
047: return getTerms(query, false);
048: }
049:
050: /**
051: * Extracts all terms texts of a given Query into an array of WeightedTerms
052: *
053: * @param query Query to extract term texts from
054: * @param reader used to compute IDF which can be used to a) score selected fragments better
055: * b) use graded highlights eg chaning intensity of font color
056: * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
057: * @return an array of the terms used in a query, plus their weights.
058: */
059: public static final WeightedTerm[] getIdfWeightedTerms(Query query,
060: IndexReader reader, String fieldName) {
061: WeightedTerm[] terms = getTerms(query, false, fieldName);
062: int totalNumDocs = reader.numDocs();
063: for (int i = 0; i < terms.length; i++) {
064: try {
065: int docFreq = reader.docFreq(new Term(fieldName,
066: terms[i].term));
067: //IDF algorithm taken from DefaultSimilarity class
068: float idf = (float) (Math.log((float) totalNumDocs
069: / (double) (docFreq + 1)) + 1.0);
070: terms[i].weight *= idf;
071: } catch (IOException e) {
072: //ignore
073: }
074: }
075: return terms;
076: }
077:
078: /**
079: * Extracts all terms texts of a given Query into an array of WeightedTerms
080: *
081: * @param query Query to extract term texts from
082: * @param prohibited <code>true</code> to extract "prohibited" terms, too
083: * @param fieldName The fieldName used to filter query terms
084: * @return an array of the terms used in a query, plus their weights.
085: */
086: public static final WeightedTerm[] getTerms(Query query,
087: boolean prohibited, String fieldName) {
088: HashSet terms = new HashSet();
089: if (fieldName != null) {
090: fieldName = fieldName.intern();
091: }
092: getTerms(query, terms, prohibited, fieldName);
093: return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
094: }
095:
096: /**
097: * Extracts all terms texts of a given Query into an array of WeightedTerms
098: *
099: * @param query Query to extract term texts from
100: * @param prohibited <code>true</code> to extract "prohibited" terms, too
101: * @return an array of the terms used in a query, plus their weights.
102: */
103: public static final WeightedTerm[] getTerms(Query query,
104: boolean prohibited) {
105: return getTerms(query, prohibited, null);
106: }
107:
108: //fieldname MUST be interned prior to this call
109: private static final void getTerms(Query query, HashSet terms,
110: boolean prohibited, String fieldName) {
111: try {
112: if (query instanceof BooleanQuery)
113: getTermsFromBooleanQuery((BooleanQuery) query, terms,
114: prohibited, fieldName);
115: else if (query instanceof FilteredQuery)
116: getTermsFromFilteredQuery((FilteredQuery) query, terms,
117: prohibited, fieldName);
118: else {
119: HashSet nonWeightedTerms = new HashSet();
120: query.extractTerms(nonWeightedTerms);
121: for (Iterator iter = nonWeightedTerms.iterator(); iter
122: .hasNext();) {
123: Term term = (Term) iter.next();
124: if ((fieldName == null)
125: || (term.field() == fieldName)) {
126: terms.add(new WeightedTerm(query.getBoost(),
127: term.text()));
128: }
129: }
130: }
131: } catch (UnsupportedOperationException ignore) {
132: //this is non-fatal for our purposes
133: }
134: }
135:
136: /**
137: * extractTerms is currently the only query-independent means of introspecting queries but it only reveals
138: * a list of terms for that query - not the boosts each individual term in that query may or may not have.
139: * "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
140: * in each child element.
141: * Some discussion around this topic here:
142: * http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
143: * Unfortunately there seemed to be limited interest in requiring all Query objects to implement
144: * something common which would allow access to child queries so what follows here are query-specific
145: * implementations for accessing embedded query elements.
146: */
147: private static final void getTermsFromBooleanQuery(
148: BooleanQuery query, HashSet terms, boolean prohibited,
149: String fieldName) {
150: BooleanClause[] queryClauses = query.getClauses();
151: for (int i = 0; i < queryClauses.length; i++) {
152: if (prohibited
153: || queryClauses[i].getOccur() != BooleanClause.Occur.MUST_NOT)
154: getTerms(queryClauses[i].getQuery(), terms, prohibited,
155: fieldName);
156: }
157: }
158:
159: private static void getTermsFromFilteredQuery(FilteredQuery query,
160: HashSet terms, boolean prohibited, String fieldName) {
161: getTerms(query.getQuery(), terms, prohibited, fieldName);
162: }
163:
164: }
|