001: package org.apache.lucene.search.spans;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021:
022: import java.util.Arrays;
023: import java.util.Comparator;
024:
025: import org.apache.lucene.index.IndexReader;
026:
027: /** A Spans that is formed from the ordered subspans of a SpanNearQuery
028: * where the subspans do not overlap and have a maximum slop between them.
029: * <p>
030: * The formed spans only contains minimum slop matches.<br>
031: * The matching slop is computed from the distance(s) between
032: * the non overlapping matching Spans.<br>
033: * Successive matches are always formed from the successive Spans
034: * of the SpanNearQuery.
035: * <p>
036: * The formed spans may contain overlaps when the slop is at least 1.
037: * For example, when querying using
038: * <pre>t1 t2 t3</pre>
039: * with slop at least 1, the fragment:
040: * <pre>t1 t2 t1 t3 t2 t3</pre>
041: * matches twice:
042: * <pre>t1 t2 .. t3 </pre>
043: * <pre> t1 .. t2 t3</pre>
044: */
045: class NearSpansOrdered implements Spans {
046: private final int allowedSlop;
047: private boolean firstTime = true;
048: private boolean more = false;
049:
050: /** The spans in the same order as the SpanNearQuery */
051: private final Spans[] subSpans;
052:
053: /** Indicates that all subSpans have same doc() */
054: private boolean inSameDoc = false;
055:
056: private int matchDoc = -1;
057: private int matchStart = -1;
058: private int matchEnd = -1;
059:
060: private final Spans[] subSpansByDoc;
061: private final Comparator spanDocComparator = new Comparator() {
062: public int compare(Object o1, Object o2) {
063: return ((Spans) o1).doc() - ((Spans) o2).doc();
064: }
065: };
066:
067: private SpanNearQuery query;
068:
069: public NearSpansOrdered(SpanNearQuery spanNearQuery,
070: IndexReader reader) throws IOException {
071: if (spanNearQuery.getClauses().length < 2) {
072: throw new IllegalArgumentException("Less than 2 clauses: "
073: + spanNearQuery);
074: }
075: allowedSlop = spanNearQuery.getSlop();
076: SpanQuery[] clauses = spanNearQuery.getClauses();
077: subSpans = new Spans[clauses.length];
078: subSpansByDoc = new Spans[clauses.length];
079: for (int i = 0; i < clauses.length; i++) {
080: subSpans[i] = clauses[i].getSpans(reader);
081: subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
082: }
083: query = spanNearQuery; // kept for toString() only.
084: }
085:
086: // inherit javadocs
087: public int doc() {
088: return matchDoc;
089: }
090:
091: // inherit javadocs
092: public int start() {
093: return matchStart;
094: }
095:
096: // inherit javadocs
097: public int end() {
098: return matchEnd;
099: }
100:
101: // inherit javadocs
102: public boolean next() throws IOException {
103: if (firstTime) {
104: firstTime = false;
105: for (int i = 0; i < subSpans.length; i++) {
106: if (!subSpans[i].next()) {
107: more = false;
108: return false;
109: }
110: }
111: more = true;
112: }
113: return advanceAfterOrdered();
114: }
115:
116: // inherit javadocs
117: public boolean skipTo(int target) throws IOException {
118: if (firstTime) {
119: firstTime = false;
120: for (int i = 0; i < subSpans.length; i++) {
121: if (!subSpans[i].skipTo(target)) {
122: more = false;
123: return false;
124: }
125: }
126: more = true;
127: } else if (more && (subSpans[0].doc() < target)) {
128: if (subSpans[0].skipTo(target)) {
129: inSameDoc = false;
130: } else {
131: more = false;
132: return false;
133: }
134: }
135: return advanceAfterOrdered();
136: }
137:
138: /** Advances the subSpans to just after an ordered match with a minimum slop
139: * that is smaller than the slop allowed by the SpanNearQuery.
140: * @return true iff there is such a match.
141: */
142: private boolean advanceAfterOrdered() throws IOException {
143: while (more && (inSameDoc || toSameDoc())) {
144: if (stretchToOrder() && shrinkToAfterShortestMatch()) {
145: return true;
146: }
147: }
148: return false; // no more matches
149: }
150:
151: /** Advance the subSpans to the same document */
152: private boolean toSameDoc() throws IOException {
153: Arrays.sort(subSpansByDoc, spanDocComparator);
154: int firstIndex = 0;
155: int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
156: while (subSpansByDoc[firstIndex].doc() != maxDoc) {
157: if (!subSpansByDoc[firstIndex].skipTo(maxDoc)) {
158: more = false;
159: inSameDoc = false;
160: return false;
161: }
162: maxDoc = subSpansByDoc[firstIndex].doc();
163: if (++firstIndex == subSpansByDoc.length) {
164: firstIndex = 0;
165: }
166: }
167: for (int i = 0; i < subSpansByDoc.length; i++) {
168: assert (subSpansByDoc[i].doc() == maxDoc) : " NearSpansOrdered.toSameDoc() spans "
169: + subSpansByDoc[0]
170: + "\n at doc "
171: + subSpansByDoc[i].doc()
172: + ", but should be at "
173: + maxDoc;
174: }
175: inSameDoc = true;
176: return true;
177: }
178:
179: /** Check whether two Spans in the same document are ordered.
180: * @param spans1
181: * @param spans2
182: * @return true iff spans1 starts before spans2
183: * or the spans start at the same position,
184: * and spans1 ends before spans2.
185: */
186: static final boolean docSpansOrdered(Spans spans1, Spans spans2) {
187: assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc()
188: + " != doc2 " + spans2.doc();
189: int start1 = spans1.start();
190: int start2 = spans2.start();
191: /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
192: return (start1 == start2) ? (spans1.end() < spans2.end())
193: : (start1 < start2);
194: }
195:
196: /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans
197: * starts and ends as parameters.
198: */
199: private static final boolean docSpansOrdered(int start1, int end1,
200: int start2, int end2) {
201: return (start1 == start2) ? (end1 < end2) : (start1 < start2);
202: }
203:
204: /** Order the subSpans within the same document by advancing all later spans
205: * after the previous one.
206: */
207: private boolean stretchToOrder() throws IOException {
208: matchDoc = subSpans[0].doc();
209: for (int i = 1; inSameDoc && (i < subSpans.length); i++) {
210: while (!docSpansOrdered(subSpans[i - 1], subSpans[i])) {
211: if (!subSpans[i].next()) {
212: inSameDoc = false;
213: more = false;
214: break;
215: } else if (matchDoc != subSpans[i].doc()) {
216: inSameDoc = false;
217: break;
218: }
219: }
220: }
221: return inSameDoc;
222: }
223:
224: /** The subSpans are ordered in the same doc, so there is a possible match.
225: * Compute the slop while making the match as short as possible by advancing
226: * all subSpans except the last one in reverse order.
227: */
228: private boolean shrinkToAfterShortestMatch() throws IOException {
229: matchStart = subSpans[subSpans.length - 1].start();
230: matchEnd = subSpans[subSpans.length - 1].end();
231: int matchSlop = 0;
232: int lastStart = matchStart;
233: int lastEnd = matchEnd;
234: for (int i = subSpans.length - 2; i >= 0; i--) {
235: Spans prevSpans = subSpans[i];
236: int prevStart = prevSpans.start();
237: int prevEnd = prevSpans.end();
238: while (true) { // Advance prevSpans until after (lastStart, lastEnd)
239: if (!prevSpans.next()) {
240: inSameDoc = false;
241: more = false;
242: break; // Check remaining subSpans for final match.
243: } else if (matchDoc != prevSpans.doc()) {
244: inSameDoc = false; // The last subSpans is not advanced here.
245: break; // Check remaining subSpans for last match in this document.
246: } else {
247: int ppStart = prevSpans.start();
248: int ppEnd = prevSpans.end(); // Cannot avoid invoking .end()
249: if (!docSpansOrdered(ppStart, ppEnd, lastStart,
250: lastEnd)) {
251: break; // Check remaining subSpans.
252: } else { // prevSpans still before (lastStart, lastEnd)
253: prevStart = ppStart;
254: prevEnd = ppEnd;
255: }
256: }
257: }
258: assert prevStart <= matchStart;
259: if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
260: matchSlop += (matchStart - prevEnd);
261: }
262: /* Do not break on (matchSlop > allowedSlop) here to make sure
263: * that subSpans[0] is advanced after the match, if any.
264: */
265: matchStart = prevStart;
266: lastStart = prevStart;
267: lastEnd = prevEnd;
268: }
269: return matchSlop <= allowedSlop; // ordered and allowed slop
270: }
271:
272: public String toString() {
273: return getClass().getName()
274: + "("
275: + query.toString()
276: + ")@"
277: + (firstTime ? "START" : (more ? (doc() + ":" + start()
278: + "-" + end()) : "END"));
279: }
280: }
|