001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.io.IOException;
020: import java.util.Collection;
021: import java.util.HashSet;
022: import java.util.Iterator;
023:
024: import org.apache.lucene.index.IndexReader;
025: import org.apache.lucene.index.Term;
026: import org.apache.lucene.search.BooleanClause;
027: import org.apache.lucene.search.BooleanQuery;
028: import org.apache.lucene.search.PhraseQuery;
029: import org.apache.lucene.search.Query;
030: import org.apache.lucene.search.TermQuery;
031: import org.apache.lucene.search.spans.SpanNearQuery;
032:
033: /**
034: * Utility class used to extract the terms used in a query, plus any weights.
035: * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery
036: * classes so the caller must pass a rewritten query (see Query.rewrite) to
037: * obtain a list of expanded terms.
038: */
039: public final class QueryTermExtractor {
040:
041: /**
042: * Extracts all terms texts of a given Query into an array of WeightedTerms
043: *
044: * @param query
045: * Query to extract term texts from
046: * @return an array of the terms used in a query, plus their weights.
047: */
048: public static final WeightedTerm[] getTerms(Query query) {
049: return getTerms(query, false);
050: }
051:
052: /**
053: * Extracts all terms texts of a given Query into an array of WeightedTerms
054: *
055: * @param query
056: * Query to extract term texts from
057: * @param reader
058: * used to compute IDF which can be used to a) score selected
059: * fragments better b) use graded highlights eg chaning intensity of
060: * font color
061: * @param fieldName
062: * the field on which Inverse Document Frequency (IDF) calculations
063: * are based
064: * @return an array of the terms used in a query, plus their weights.
065: */
066: public static final WeightedTerm[] getIdfWeightedTerms(Query query,
067: IndexReader reader, String fieldName) {
068: WeightedTerm[] terms = getTerms(query, false, fieldName);
069: int totalNumDocs = reader.numDocs();
070: for (int i = 0; i < terms.length; i++) {
071: try {
072: int docFreq = reader.docFreq(new Term(fieldName,
073: terms[i].term));
074: // IDF algorithm taken from DefaultSimilarity class
075: float idf = (float) (Math.log((float) totalNumDocs
076: / (double) (docFreq + 1)) + 1.0);
077: terms[i].weight *= idf;
078: } catch (IOException e) {
079: // ignore
080: }
081: }
082: return terms;
083: }
084:
085: /**
086: * Extracts all terms texts of a given Query into an array of WeightedTerms
087: *
088: * @param query
089: * Query to extract term texts from
090: * @param prohibited
091: * <code>true</code> to extract "prohibited" terms, too
092: * @param fieldName
093: * The fieldName used to filter query terms
094: * @return an array of the terms used in a query, plus their weights.
095: */
096: public static final WeightedTerm[] getTerms(Query query,
097: boolean prohibited, String fieldName) {
098: HashSet terms = new HashSet();
099: if (fieldName != null) {
100: fieldName = fieldName.intern();
101: }
102: getTerms(query, terms, prohibited, fieldName);
103: return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
104: }
105:
106: /**
107: * Extracts all terms texts of a given Query into an array of WeightedTerms
108: *
109: * @param query
110: * Query to extract term texts from
111: * @param prohibited
112: * <code>true</code> to extract "prohibited" terms, too
113: * @return an array of the terms used in a query, plus their weights.
114: */
115: public static final WeightedTerm[] getTerms(Query query,
116: boolean prohibited) {
117: return getTerms(query, prohibited, null);
118: }
119:
120: // fieldname MUST be interned prior to this call
121: private static final void getTerms(Query query, HashSet terms,
122: boolean prohibited, String fieldName) {
123: if (query instanceof BooleanQuery)
124: getTermsFromBooleanQuery((BooleanQuery) query, terms,
125: prohibited, fieldName);
126: else if (query instanceof PhraseQuery)
127: getTermsFromPhraseQuery((PhraseQuery) query, terms,
128: fieldName);
129: else if (query instanceof TermQuery)
130: getTermsFromTermQuery((TermQuery) query, terms, fieldName);
131: else if (query instanceof SpanNearQuery)
132: getTermsFromSpanNearQuery((SpanNearQuery) query, terms,
133: fieldName);
134: }
135:
136: private static final void getTermsFromBooleanQuery(
137: BooleanQuery query, HashSet terms, boolean prohibited,
138: String fieldName) {
139: BooleanClause[] queryClauses = query.getClauses();
140: int i;
141:
142: for (i = 0; i < queryClauses.length; i++) {
143: // Pre Lucene 2.0 code
144: //if (prohibited || !queryClauses[i].prohibited)
145: // getTerms(queryClauses[i].query, terms, prohibited, fieldName);
146: // Lucene 2.0 ready code
147: if (prohibited
148: || queryClauses[i].getOccur() != BooleanClause.Occur.MUST_NOT)
149: getTerms(queryClauses[i].getQuery(), terms, prohibited,
150: fieldName);
151: }
152: }
153:
154: private static final void getTermsFromPhraseQuery(
155: PhraseQuery query, HashSet terms, String fieldName) {
156: Term[] queryTerms = query.getTerms();
157: int i;
158:
159: for (i = 0; i < queryTerms.length; i++) {
160: if ((fieldName == null)
161: || (queryTerms[i].field() == fieldName)) {
162: terms.add(new WeightedTerm(query.getBoost(),
163: queryTerms[i].text()));
164: }
165: }
166: }
167:
168: private static final void getTermsFromTermQuery(TermQuery query,
169: HashSet terms, String fieldName) {
170: if ((fieldName == null)
171: || (query.getTerm().field() == fieldName)) {
172: terms.add(new WeightedTerm(query.getBoost(), query
173: .getTerm().text()));
174: }
175: }
176:
177: private static final void getTermsFromSpanNearQuery(
178: SpanNearQuery query, HashSet terms, String fieldName) {
179:
180: Collection queryTerms = query.getTerms();
181:
182: for (Iterator iterator = queryTerms.iterator(); iterator
183: .hasNext();) {
184:
185: // break it out for debugging.
186:
187: Term term = (Term) iterator.next();
188:
189: String text = term.text();
190:
191: if ((fieldName == null) || (term.field() == fieldName)) {
192: terms.add(new WeightedTerm(query.getBoost(), text));
193: }
194: }
195:
196: }
197:
198: }
|