package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.*;
/** Expert: Scoring functionality for phrase queries.
 * <br>A document is considered matching if it contains the phrase-query terms
 * at "valid" positions. What "valid positions" are
 * depends on the type of the phrase query: for an exact phrase query the terms are required
 * to appear in adjacent locations, while for a sloppy phrase query some distance between
 * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes
 * is invoked for each document containing all the phrase query terms, in order to
 * compute the frequency of the phrase query in that document. A non-zero frequency
 * means a match.
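 * <p>For example, an illustrative sketch of the query side (not part of this class's API)
 * showing how the two cases arise from a {@link PhraseQuery}:
 * <pre>
 *   PhraseQuery query = new PhraseQuery();
 *   query.add(new Term("body", "quick"));
 *   query.add(new Term("body", "fox"));  // exact phrase: "quick" and "fox" must be adjacent
 *   query.setSlop(2);                    // sloppy phrase: terms may be up to 2 position moves apart
 * </pre>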
 */
abstract class PhraseScorer extends Scorer {
  private Weight weight;
  protected byte[] norms;
  protected float value;

  private boolean firstTime = true;
  private boolean more = true;
  protected PhraseQueue pq;
  protected PhrasePositions first, last;

  private float freq;                             // phrase frequency in current doc as computed by phraseFreq().

  PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets,
               Similarity similarity, byte[] norms) {
    super(similarity);
    this.norms = norms;
    this.weight = weight;
    this.value = weight.getValue();

    // convert tps to a list of phrase positions.
    // note: phrase-position differs from term-position in that its position
    // reflects the phrase offset: pp.pos = tp.pos - offset.
    // this allows to easily identify a matching (exact) phrase
    // when all PhrasePositions have exactly the same position.
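    // Illustrative example (assumed document positions): for the phrase "quick fox",
    // "quick" has offset 0 and "fox" has offset 1. If a document contains "quick" at
    // position 7 and "fox" at position 8, both PhrasePositions report pos = 7,
    // signalling an exact match starting at that position.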
    for (int i = 0; i < tps.length; i++) {
      PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]);
      if (last != null) {                         // add next to end of list
        last.next = pp;
      } else
        first = pp;
      last = pp;
    }

    pq = new PhraseQueue(tps.length);             // construct empty pq
  }

  public int doc() {
    return first.doc;
  }

  public boolean next() throws IOException {
    if (firstTime) {
      init();
      firstTime = false;
    } else if (more) {
      more = last.next();                         // trigger further scanning
    }
    return doNext();
  }

  // next without initial increment
  private boolean doNext() throws IOException {
    while (more) {
      while (more && first.doc < last.doc) {      // find doc w/ all the terms
        more = first.skipTo(last.doc);            // skip first upto last
        firstToLast();                            // and move it to the end
      }

      if (more) {
        // found a doc with all of the terms
        freq = phraseFreq();                      // check for phrase
        if (freq == 0.0f)                         // no match
          more = last.next();                     // trigger further scanning
        else
          return true;                            // found a match
      }
    }
    return false;                                 // no more matches
  }

  public float score() throws IOException {
    //System.out.println("scoring " + first.doc);
    float raw = getSimilarity().tf(freq) * value; // raw score
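    // Illustrative (assumes the default Similarity, where tf(freq) = sqrt(freq)):
    // with freq = 4 this yields raw = 2 * value, before length normalization below.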
    return raw * Similarity.decodeNorm(norms[first.doc]); // normalize
  }

  public boolean skipTo(int target) throws IOException {
    firstTime = false;
    for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
      more = pp.skipTo(target);
    }
    if (more)
      sort();                                     // re-sort
    return doNext();
  }

  /**
   * For a document containing all the phrase query terms, compute the
   * frequency of the phrase in that document.
   * A non-zero frequency means a match.
   * <br>Note that containing all phrase terms does not guarantee a match - they have to be found in matching locations.
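   * <br>For example (illustrative): for the exact phrase "a b" and a document whose
   * field reads "a b c a b", the phrase starts at two valid positions, so 2.0 is returned.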
   * @return frequency of the phrase in the current doc, 0 if not found.
   */
  protected abstract float phraseFreq() throws IOException;

  private void init() throws IOException {
    for (PhrasePositions pp = first; more && pp != null; pp = pp.next)
      more = pp.next();
    if (more)
      sort();
  }

  private void sort() {
    pq.clear();
    for (PhrasePositions pp = first; pp != null; pp = pp.next)
      pq.put(pp);
    pqToList();
  }

  protected final void pqToList() {
    last = first = null;
    while (pq.top() != null) {
      PhrasePositions pp = (PhrasePositions) pq.pop();
      if (last != null) {                         // add next to end of list
        last.next = pp;
      } else
        first = pp;
      last = pp;
      pp.next = null;
    }
  }

  protected final void firstToLast() {
    last.next = first;                            // move first to end of list
    last = first;
    first = first.next;
    last.next = null;
  }

  public Explanation explain(final int doc) throws IOException {
    Explanation tfExplanation = new Explanation();

    while (next() && doc() < doc) {
    }

    float phraseFreq = (doc() == doc) ? freq : 0.0f;
    tfExplanation.setValue(getSimilarity().tf(phraseFreq));
    tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");

    return tfExplanation;
  }

  public String toString() {
    return "scorer(" + weight + ")";
  }

}