001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.index.TermPositions;
021:
022: import java.io.IOException;
023: import java.util.Arrays;
024: import java.util.Comparator;
025: import java.util.HashMap;
026:
027: final class SloppyPhraseScorer extends PhraseScorer {
028: private int slop;
029: private PhrasePositions repeats[];
030: private boolean checkedRepeats;
031:
032: SloppyPhraseScorer(Weight weight, TermPositions[] tps,
033: int[] offsets, Similarity similarity, int slop, byte[] norms) {
034: super (weight, tps, offsets, similarity, norms);
035: this .slop = slop;
036: }
037:
038: /**
039: * Score a candidate doc for all slop-valid position-combinations (matches)
040: * encountered while traversing/hopping the PhrasePositions.
041: * <br> The score contribution of a match depends on the distance:
042: * <br> - highest score for distance=0 (exact match).
043: * <br> - score gets lower as distance gets higher.
044: * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
045: * once for "a b" (distance=0), and once for "b a" (distance=2).
046: * <br>Pssibly not all valid combinations are encountered, because for efficiency
047: * we always propagate the least PhrasePosition. This allows to base on
048: * PriorityQueue and move forward faster.
049: * As result, for example, document "a b c b a"
050: * would score differently for queries "a b c"~4 and "c b a"~4, although
051: * they really are equivalent.
052: * Similarly, for doc "a b c b a f g", query "c b"~2
053: * would get same score as "g f"~2, although "c b"~2 could be matched twice.
054: * We may want to fix this in the future (currently not, for performance reasons).
055: */
056: protected final float phraseFreq() throws IOException {
057: int end = initPhrasePositions();
058:
059: float freq = 0.0f;
060: boolean done = (end < 0);
061: while (!done) {
062: PhrasePositions pp = (PhrasePositions) pq.pop();
063: int start = pp.position;
064: int next = ((PhrasePositions) pq.top()).position;
065:
066: boolean tpsDiffer = true;
067: for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
068: if (pos <= next && tpsDiffer)
069: start = pos; // advance pp to min window
070: if (!pp.nextPosition()) {
071: done = true; // ran out of a term -- done
072: break;
073: }
074: tpsDiffer = !pp.repeats || termPositionsDiffer(pp);
075: }
076:
077: int matchLength = end - start;
078: if (matchLength <= slop)
079: freq += getSimilarity().sloppyFreq(matchLength); // score match
080:
081: if (pp.position > end)
082: end = pp.position;
083: pq.put(pp); // restore pq
084: }
085:
086: return freq;
087: }
088:
089: /**
090: * Init PhrasePositions in place.
091: * There is a one time initializatin for this scorer:
092: * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
093: * <br>- Also mark each such pp by pp.repeats = true.
094: * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
095: * In particular, this allows to score queries with no repetiotions with no overhead due to this computation.
096: * <br>- Example 1 - query with no repetitions: "ho my"~2
097: * <br>- Example 2 - query with repetitions: "ho my my"~2
098: * <br>- Example 3 - query with repetitions: "my ho my"~2
099: * <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
100: * @return end (max position), or -1 if any term ran out (i.e. done)
101: * @throws IOException
102: */
103: private int initPhrasePositions() throws IOException {
104: int end = 0;
105:
106: // no repeats at all (most common case is also the simplest one)
107: if (checkedRepeats && repeats == null) {
108: // build queue from list
109: pq.clear();
110: for (PhrasePositions pp = first; pp != null; pp = pp.next) {
111: pp.firstPosition();
112: if (pp.position > end)
113: end = pp.position;
114: pq.put(pp); // build pq from list
115: }
116: return end;
117: }
118:
119: // position the pp's
120: for (PhrasePositions pp = first; pp != null; pp = pp.next)
121: pp.firstPosition();
122:
123: // one time initializatin for this scorer
124: if (!checkedRepeats) {
125: checkedRepeats = true;
126: // check for repeats
127: HashMap m = null;
128: for (PhrasePositions pp = first; pp != null; pp = pp.next) {
129: int tpPos = pp.position + pp.offset;
130: for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
131: int tpPos2 = pp2.position + pp2.offset;
132: if (tpPos2 == tpPos) {
133: if (m == null)
134: m = new HashMap();
135: pp.repeats = true;
136: pp2.repeats = true;
137: m.put(pp, null);
138: m.put(pp2, null);
139: }
140: }
141: }
142: if (m != null)
143: repeats = (PhrasePositions[]) m.keySet().toArray(
144: new PhrasePositions[0]);
145: }
146:
147: // with repeats must advance some repeating pp's so they all start with differing tp's
148: if (repeats != null) {
149: // must propagate higher offsets first (otherwise might miss matches).
150: Arrays.sort(repeats, new Comparator() {
151: public int compare(Object x, Object y) {
152: return ((PhrasePositions) y).offset
153: - ((PhrasePositions) x).offset;
154: }
155: });
156: // now advance them
157: for (int i = 0; i < repeats.length; i++) {
158: PhrasePositions pp = repeats[i];
159: while (!termPositionsDiffer(pp)) {
160: if (!pp.nextPosition())
161: return -1; // ran out of a term -- done
162: }
163: }
164: }
165:
166: // build queue from list
167: pq.clear();
168: for (PhrasePositions pp = first; pp != null; pp = pp.next) {
169: if (pp.position > end)
170: end = pp.position;
171: pq.put(pp); // build pq from list
172: }
173:
174: return end;
175: }
176:
177: // disalow two pp's to have the same tp position, so that same word twice
178: // in query would go elswhere in the matched doc
179: private boolean termPositionsDiffer(PhrasePositions pp) {
180: // efficiency note: a more efficient implemention could keep a map between repeating
181: // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
182: // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
183: // However this would complicate code, for a rather rare case, so choice is to compromise here.
184: int tpPos = pp.position + pp.offset;
185: for (int i = 0; i < repeats.length; i++) {
186: PhrasePositions pp2 = repeats[i];
187: if (pp2 == pp)
188: continue;
189: int tpPos2 = pp2.position + pp2.offset;
190: if (tpPos2 == tpPos)
191: return false;
192: }
193: return true;
194: }
195: }
|