001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.util.HashMap;
020: import java.util.HashSet;
021:
022: import org.apache.lucene.analysis.Token;
023: import org.apache.lucene.index.IndexReader;
024: import org.apache.lucene.search.Query;
025:
026: /**
027: * {@link Scorer} implementation which scores text fragments by the number of
028: * unique query terms found. This class uses the {@link QueryTermExtractor}
029: * class to process determine the query terms and their boosts to be used.
030: *
031: * @author mark@searcharea.co.uk
032: */
033: // TODO: provide option to boost score of fragments near beginning of document
034: // based on fragment.getFragNum()
035: public class QueryScorer implements Scorer {
036: TextFragment currentTextFragment = null;
037:
038: HashSet uniqueTermsInFragment;
039:
040: float totalScore = 0;
041:
042: float maxTermWeight = 0;
043:
044: private HashMap termsToFind;
045:
046: /**
047: * @param query
048: * a Lucene query (ideally rewritten using query.rewrite before being
049: * passed to this class and the searcher)
050: */
051: public QueryScorer(Query query) {
052: this (QueryTermExtractor.getTerms(query));
053: }
054:
055: /**
056: * @param query
057: * a Lucene query (ideally rewritten using query.rewrite before being
058: * passed to this class and the searcher)
059: * @param fieldName
060: * the Field name which is used to match Query terms
061: */
062: public QueryScorer(Query query, String fieldName) {
063: this (QueryTermExtractor.getTerms(query, false, fieldName));
064: }
065:
066: /**
067: * @param query
068: * a Lucene query (ideally rewritten using query.rewrite before being
069: * passed to this class and the searcher)
070: * @param reader
071: * used to compute IDF which can be used to a) score selected
072: * fragments better b) use graded highlights eg set font color
073: * intensity
074: * @param fieldName
075: * the field on which Inverse Document Frequency (IDF) calculations
076: * are based
077: */
078: public QueryScorer(Query query, IndexReader reader, String fieldName) {
079: this (QueryTermExtractor.getIdfWeightedTerms(query, reader,
080: fieldName));
081: }
082:
083: public QueryScorer(WeightedTerm[] weightedTerms) {
084: termsToFind = new HashMap();
085: for (int i = 0; i < weightedTerms.length; i++) {
086: WeightedTerm existingTerm = (WeightedTerm) termsToFind
087: .get(weightedTerms[i].term);
088: if ((existingTerm == null)
089: || (existingTerm.weight < weightedTerms[i].weight)) {
090: // if a term is defined more than once, always use the highest
091: // scoring weight
092: termsToFind
093: .put(weightedTerms[i].term, weightedTerms[i]);
094: maxTermWeight = Math.max(maxTermWeight,
095: weightedTerms[i].getWeight());
096: }
097: }
098: }
099:
100: /*
101: * (non-Javadoc)
102: *
103: * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
104: */
105: public void startFragment(TextFragment newFragment) {
106: uniqueTermsInFragment = new HashSet();
107: currentTextFragment = newFragment;
108: totalScore = 0;
109:
110: }
111:
112: /*
113: * (non-Javadoc)
114: *
115: * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
116: */
117: public float getTokenScore(Token token) {
118: String termText = token.termText();
119:
120: WeightedTerm queryTerm = (WeightedTerm) termsToFind
121: .get(termText);
122: if (queryTerm == null) {
123: // not a query term - return
124: return 0;
125: }
126: // found a query term - is it unique in this doc?
127: if (!uniqueTermsInFragment.contains(termText)) {
128: totalScore += queryTerm.getWeight();
129: uniqueTermsInFragment.add(termText);
130: }
131: return queryTerm.getWeight();
132: }
133:
134: /*
135: * (non-Javadoc)
136: *
137: * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
138: */
139: public float getFragmentScore() {
140: return totalScore;
141: }
142:
143: /*
144: * (non-Javadoc)
145: *
146: * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
147: */
148: public void allFragmentsProcessed() {
149: // this class has no special operations to perform at end of processing
150: }
151:
152: /**
153: * @return The highest weighted term (useful for passing to
154: * GradientFormatter to set top end of coloring scale.
155: */
156: public float getMaxTermWeight() {
157: return maxTermWeight;
158: }
159: }
|