001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019: import java.io.IOException;
020: import java.util.BitSet;
021:
022: import org.apache.lucene.index.IndexReader;
023: import org.apache.lucene.index.Term;
024: import org.apache.lucene.index.TermDocs;
025: import org.apache.lucene.index.TermEnum;
026:
027: public class DuplicateFilter extends Filter {
028:
029: String fieldName;
030:
031: /**
032: * KeepMode determines which document id to consider as the master, all others being
033: * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
034: */
035: int keepMode = KM_USE_FIRST_OCCURRENCE;
036: public static final int KM_USE_FIRST_OCCURRENCE = 1;
037: public static final int KM_USE_LAST_OCCURRENCE = 2;
038:
039: /**
040: * "Full" processing mode starts by setting all bits to false and only setting bits
041: * for documents that contain the given field and are identified as none-duplicates.
042:
043: * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
044: * given field. This approach avoids the need to read TermDocs for terms that are seen
045: * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
046: * faster approach , the downside is that bitsets produced will include bits set for
047: * documents that do not actually contain the field given.
048: *
049: */
050: int processingMode = PM_FULL_VALIDATION;
051: public static final int PM_FULL_VALIDATION = 1;
052: public static final int PM_FAST_INVALIDATION = 2;
053:
054: public DuplicateFilter(String fieldName) {
055: this (fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION);
056: }
057:
058: public DuplicateFilter(String fieldName, int keepMode,
059: int processingMode) {
060: this .fieldName = fieldName;
061: this .keepMode = keepMode;
062: this .processingMode = processingMode;
063: }
064:
065: public BitSet bits(IndexReader reader) throws IOException {
066: if (processingMode == PM_FAST_INVALIDATION) {
067: return fastBits(reader);
068: } else {
069: return correctBits(reader);
070: }
071: }
072:
073: private BitSet correctBits(IndexReader reader) throws IOException {
074:
075: BitSet bits = new BitSet(reader.maxDoc()); //assume all are INvalid
076: Term startTerm = new Term(fieldName, "");
077: TermEnum te = reader.terms(startTerm);
078: if (te != null) {
079: Term currTerm = te.term();
080: while ((currTerm != null)
081: && (currTerm.field() == startTerm.field())) //term fieldnames are interned
082: {
083: int lastDoc = -1;
084: //set non duplicates
085: TermDocs td = reader.termDocs(currTerm);
086: if (td.next()) {
087: if (keepMode == KM_USE_FIRST_OCCURRENCE) {
088: bits.set(td.doc());
089: } else {
090: do {
091: lastDoc = td.doc();
092: } while (td.next());
093: bits.set(lastDoc);
094: }
095: }
096: if (!te.next()) {
097: break;
098: }
099: currTerm = te.term();
100: }
101: }
102: return bits;
103: }
104:
105: private BitSet fastBits(IndexReader reader) throws IOException {
106:
107: BitSet bits = new BitSet(reader.maxDoc());
108: bits.set(0, reader.maxDoc()); //assume all are valid
109: Term startTerm = new Term(fieldName, "");
110: TermEnum te = reader.terms(startTerm);
111: if (te != null) {
112: Term currTerm = te.term();
113:
114: while ((currTerm != null)
115: && (currTerm.field() == startTerm.field())) //term fieldnames are interned
116: {
117: if (te.docFreq() > 1) {
118: int lastDoc = -1;
119: //unset potential duplicates
120: TermDocs td = reader.termDocs(currTerm);
121: td.next();
122: if (keepMode == KM_USE_FIRST_OCCURRENCE) {
123: td.next();
124: }
125: do {
126: lastDoc = td.doc();
127: bits.set(lastDoc, false);
128: } while (td.next());
129: if (keepMode == KM_USE_LAST_OCCURRENCE) {
130: //restore the last bit
131: bits.set(lastDoc);
132: }
133: }
134: if (!te.next()) {
135: break;
136: }
137: currTerm = te.term();
138: }
139: }
140: return bits;
141: }
142:
143: /**
144: * @param args
145: * @throws IOException
146: * @throws Exception
147: */
148: public static void main(String[] args) throws Exception {
149: IndexReader r = IndexReader.open("/indexes/personCentricAnon");
150: // IndexReader r=IndexReader.open("/indexes/enron");
151: long start = System.currentTimeMillis();
152: // DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
153: // DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
154: DuplicateFilter df = new DuplicateFilter("vehicle.vrm",
155: KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
156: // DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
157: // df.setProcessingMode(PM_SLOW_VALIDATION);
158: BitSet b = df.bits(r);
159: long end = System.currentTimeMillis() - start;
160: System.out.println(b.cardinality() + " in " + end + " ms ");
161:
162: }
163:
164: public String getFieldName() {
165: return fieldName;
166: }
167:
168: public void setFieldName(String fieldName) {
169: this .fieldName = fieldName;
170: }
171:
172: public int getKeepMode() {
173: return keepMode;
174: }
175:
176: public void setKeepMode(int keepMode) {
177: this .keepMode = keepMode;
178: }
179:
180: public boolean equals(Object obj) {
181: if (this == obj)
182: return true;
183: if ((obj == null) || (obj.getClass() != this .getClass()))
184: return false;
185: DuplicateFilter other = (DuplicateFilter) obj;
186: return keepMode == other.keepMode
187: && processingMode == other.processingMode
188: && (fieldName == other.fieldName || (fieldName != null && fieldName
189: .equals(other.fieldName)));
190: }
191:
192: public int hashCode() {
193: int hash = 217;
194: hash = 31 * hash + keepMode;
195: hash = 31 * hash + processingMode;
196: hash = 31 * hash + fieldName.hashCode();
197: return hash;
198: }
199:
200: public int getProcessingMode() {
201: return processingMode;
202: }
203:
204: public void setProcessingMode(int processingMode) {
205: this.processingMode = processingMode;
206: }
207:
208: }
|