001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2004 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.util.HashMap;
020: import java.util.HashSet;
021:
022: import org.apache.lucene.analysis.Token;
023: import org.apache.lucene.index.IndexReader;
024: import org.apache.lucene.search.Query;
025:
026: /**
027: * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
028: * This class uses the {@link QueryTermExtractor} class to process determine the query terms and
029: * their boosts to be used.
030: * @author mark@searcharea.co.uk
031: */
032: //TODO: provide option to boost score of fragments near beginning of document
033: // based on fragment.getFragNum()
034: public class QueryScorer implements Scorer {
035: TextFragment currentTextFragment = null;
036: HashSet uniqueTermsInFragment;
037: float totalScore = 0;
038: float maxTermWeight = 0;
039: private HashMap termsToFind;
040:
041: /**
042: *
043: * @param query a Lucene query (ideally rewritten using query.rewrite
044: * before being passed to this class and the searcher)
045: */
046: public QueryScorer(Query query) {
047: this (QueryTermExtractor.getTerms(query));
048: }
049:
050: /**
051: *
052: * @param query a Lucene query (ideally rewritten using query.rewrite
053: * before being passed to this class and the searcher)
054: * @param fieldName the Field name which is used to match Query terms
055: */
056: public QueryScorer(Query query, String fieldName) {
057: this (QueryTermExtractor.getTerms(query, false, fieldName));
058: }
059:
060: /**
061: *
062: * @param query a Lucene query (ideally rewritten using query.rewrite
063: * before being passed to this class and the searcher)
064: * @param reader used to compute IDF which can be used to a) score selected fragments better
065: * b) use graded highlights eg set font color intensity
066: * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
067: */
068: public QueryScorer(Query query, IndexReader reader, String fieldName) {
069: this (QueryTermExtractor.getIdfWeightedTerms(query, reader,
070: fieldName));
071: }
072:
073: public QueryScorer(WeightedTerm[] weightedTerms) {
074: termsToFind = new HashMap();
075: for (int i = 0; i < weightedTerms.length; i++) {
076: WeightedTerm existingTerm = (WeightedTerm) termsToFind
077: .get(weightedTerms[i].term);
078: if ((existingTerm == null)
079: || (existingTerm.weight < weightedTerms[i].weight)) {
080: //if a term is defined more than once, always use the highest scoring weight
081: termsToFind
082: .put(weightedTerms[i].term, weightedTerms[i]);
083: maxTermWeight = Math.max(maxTermWeight,
084: weightedTerms[i].getWeight());
085: }
086: }
087: }
088:
089: /* (non-Javadoc)
090: * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
091: */
092: public void startFragment(TextFragment newFragment) {
093: uniqueTermsInFragment = new HashSet();
094: currentTextFragment = newFragment;
095: totalScore = 0;
096:
097: }
098:
099: /* (non-Javadoc)
100: * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
101: */
102: public float getTokenScore(Token token) {
103: String termText = token.termText();
104:
105: WeightedTerm queryTerm = (WeightedTerm) termsToFind
106: .get(termText);
107: if (queryTerm == null) {
108: //not a query term - return
109: return 0;
110: }
111: //found a query term - is it unique in this doc?
112: if (!uniqueTermsInFragment.contains(termText)) {
113: totalScore += queryTerm.getWeight();
114: uniqueTermsInFragment.add(termText);
115: }
116: return queryTerm.getWeight();
117: }
118:
119: /* (non-Javadoc)
120: * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
121: */
122: public float getFragmentScore() {
123: return totalScore;
124: }
125:
126: /* (non-Javadoc)
127: * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
128: */
129: public void allFragmentsProcessed() {
130: //this class has no special operations to perform at end of processing
131: }
132:
133: /**
134: *
135: * @return The highest weighted term (useful for passing to GradientFormatter to set
136: * top end of coloring scale.
137: */
138: public float getMaxTermWeight() {
139: return maxTermWeight;
140: }
141: }
|