Source Code Cross Referenced for FuzzyLikeThisQuery.java in » Net » lucene-connector » org » apache » lucene » search » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Net » lucene connector » org.apache.lucene.search

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package org.apache.lucene.search;
002:
003:        /**
004:         * Licensed to the Apache Software Foundation (ASF) under one or more
005:         * contributor license agreements.  See the NOTICE file distributed with
006:         * this work for additional information regarding copyright ownership.
007:         * The ASF licenses this file to You under the Apache License, Version 2.0
008:         * (the "License"); you may not use this file except in compliance with
009:         * the License.  You may obtain a copy of the License at
010:         *
011:         *     http://www.apache.org/licenses/LICENSE-2.0
012:         *
013:         * Unless required by applicable law or agreed to in writing, software
014:         * distributed under the License is distributed on an "AS IS" BASIS,
015:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016:         * See the License for the specific language governing permissions and
017:         * limitations under the License.
018:         */
019:
020:        import java.io.IOException;
021:        import java.io.StringReader;
022:        import java.util.ArrayList;
023:        import java.util.HashMap;
024:        import java.util.HashSet;
025:        import java.util.Iterator;
026:
027:        import org.apache.lucene.analysis.Analyzer;
028:        import org.apache.lucene.analysis.Token;
029:        import org.apache.lucene.analysis.TokenStream;
030:        import org.apache.lucene.index.IndexReader;
031:        import org.apache.lucene.index.Term;
032:        import org.apache.lucene.index.TermEnum;
033:        import org.apache.lucene.util.PriorityQueue;
034:
035:        /**
036:         * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
037:         * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
038:         * of fuzzy scoring factors.
039:         * This generally produces good results for queries where users may provide details in a number of 
040:         * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
041:         * a fast query.
042:         * 
043:         * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
044:         * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
045:         * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
046:         * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
047:         * term) and this is factored into the variant's boost. If the source query term does not exist in the
048:         * index the average IDF of the variants is used. 
049:         * @author maharwood
050:         */
051:        public class FuzzyLikeThisQuery extends Query {
052:            static Similarity sim = new DefaultSimilarity();
053:            Query rewrittenQuery = null;
054:            ArrayList fieldVals = new ArrayList();
055:            Analyzer analyzer;
056:
057:            ScoreTermQueue q;
058:            int MAX_VARIANTS_PER_TERM = 50;
059:            boolean ignoreTF = false;
060:
061:            /**
062:             * 
063:             * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
064:             * @param analyzer
065:             */
066:            public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) {
067:                q = new ScoreTermQueue(maxNumTerms);
068:                this .analyzer = analyzer;
069:            }
070:
071:            class FieldVals {
072:                String queryString;
073:                String fieldName;
074:                float minSimilarity;
075:                int prefixLength;
076:
077:                public FieldVals(String name, float similarity, int length,
078:                        String queryString) {
079:                    fieldName = name;
080:                    minSimilarity = similarity;
081:                    prefixLength = length;
082:                    this .queryString = queryString;
083:                }
084:
085:            }
086:
087:            /**
088:             * Adds user input for "fuzzification" 
089:             * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
090:             * @param fieldName
091:             * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
092:             * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
093:             */
094:            public void addTerms(String queryString, String fieldName,
095:                    float minSimilarity, int prefixLength) {
096:                fieldVals.add(new FieldVals(fieldName, minSimilarity,
097:                        prefixLength, queryString));
098:            }
099:
100:            private void addTerms(IndexReader reader, FieldVals f)
101:                    throws IOException {
102:                if (f.queryString == null)
103:                    return;
104:                TokenStream ts = analyzer.tokenStream(f.fieldName,
105:                        new StringReader(f.queryString));
106:                Token token = ts.next();
107:                int corpusNumDocs = reader.numDocs();
108:                Term internSavingTemplateTerm = new Term(f.fieldName, ""); //optimization to avoid constructing new Term() objects
109:                HashSet processedTerms = new HashSet();
110:                while (token != null) {
111:                    if (!processedTerms.contains(token.termText())) {
112:                        processedTerms.add(token.termText());
113:                        ScoreTermQueue variantsQ = new ScoreTermQueue(
114:                                MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
115:                        float minScore = 0;
116:                        Term startTerm = internSavingTemplateTerm
117:                                .createTerm(token.termText());
118:                        FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm,
119:                                f.minSimilarity, f.prefixLength);
120:                        TermEnum origEnum = reader.terms(startTerm);
121:                        int df = 0;
122:                        if (startTerm.equals(origEnum.term())) {
123:                            df = origEnum.docFreq(); //store the df so all variants use same idf
124:                        }
125:                        int numVariants = 0;
126:                        int totalVariantDocFreqs = 0;
127:                        do {
128:                            Term possibleMatch = fe.term();
129:                            if (possibleMatch != null) {
130:                                numVariants++;
131:                                totalVariantDocFreqs += fe.docFreq();
132:                                float score = fe.difference();
133:                                if (variantsQ.size() < MAX_VARIANTS_PER_TERM
134:                                        || score > minScore) {
135:                                    ScoreTerm st = new ScoreTerm(possibleMatch,
136:                                            score, startTerm);
137:                                    variantsQ.insert(st);
138:                                    minScore = ((ScoreTerm) variantsQ.top()).score; // maintain minScore
139:                                }
140:                            }
141:                        } while (fe.next());
142:                        if (numVariants == 0) {
143:                            //no variants to rank here
144:                            break;
145:                        }
146:                        int avgDf = totalVariantDocFreqs / numVariants;
147:                        if (df == 0)//no direct match we can use as df for all variants 
148:                        {
149:                            df = avgDf; //use avg df of all variants
150:                        }
151:
152:                        // take the top variants (scored by edit distance) and reset the score
153:                        // to include an IDF factor then add to the global queue for ranking overall top query terms
154:                        int size = variantsQ.size();
155:                        for (int i = 0; i < size; i++) {
156:                            ScoreTerm st = (ScoreTerm) variantsQ.pop();
157:                            st.score = (st.score * st.score)
158:                                    * sim.idf(df, corpusNumDocs);
159:                            q.insert(st);
160:                        }
161:                    }
162:                    token = ts.next();
163:                }
164:            }
165:
166:            public Query rewrite(IndexReader reader) throws IOException {
167:                if (rewrittenQuery != null) {
168:                    return rewrittenQuery;
169:                }
170:                //load up the list of possible terms
171:                for (Iterator iter = fieldVals.iterator(); iter.hasNext();) {
172:                    FieldVals f = (FieldVals) iter.next();
173:                    addTerms(reader, f);
174:                }
175:                //clear the list of fields
176:                fieldVals.clear();
177:
178:                BooleanQuery bq = new BooleanQuery();
179:
180:                //create BooleanQueries to hold the variants for each token/field pair and ensure it
181:                // has no coord factor
182:                //Step 1: sort the termqueries by term/field
183:                HashMap variantQueries = new HashMap();
184:                int size = q.size();
185:                for (int i = 0; i < size; i++) {
186:                    ScoreTerm st = (ScoreTerm) q.pop();
187:                    ArrayList l = (ArrayList) variantQueries
188:                            .get(st.fuzziedSourceTerm);
189:                    if (l == null) {
190:                        l = new ArrayList();
191:                        variantQueries.put(st.fuzziedSourceTerm, l);
192:                    }
193:                    l.add(st);
194:                }
195:                //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
196:                for (Iterator iter = variantQueries.values().iterator(); iter
197:                        .hasNext();) {
198:                    ArrayList variants = (ArrayList) iter.next();
199:                    if (variants.size() == 1) {
200:                        //optimize where only one selected variant
201:                        ScoreTerm st = (ScoreTerm) variants.get(0);
202:                        TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF);
203:                        tq.setBoost(st.score); // set the boost to a mix of IDF and score
204:                        bq.add(tq, BooleanClause.Occur.SHOULD);
205:                    } else {
206:                        BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
207:                        for (Iterator iterator2 = variants.iterator(); iterator2
208:                                .hasNext();) {
209:                            ScoreTerm st = (ScoreTerm) iterator2.next();
210:                            TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF); // found a match
211:                            tq.setBoost(st.score); // set the boost using the ScoreTerm's score
212:                            termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query                    
213:                        }
214:                        bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
215:                    }
216:                }
217:                //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
218:                // booleans with a minimum-should-match of NumFields-1?
219:                bq.setBoost(getBoost());
220:                this .rewrittenQuery = bq;
221:                return bq;
222:            }
223:
224:            //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
225:            // term variants) then is reset with IDF for use in ranking against all other
226:            // terms/fields
227:            private static class ScoreTerm {
228:                public Term term;
229:                public float score;
230:                Term fuzziedSourceTerm;
231:
232:                public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) {
233:                    this .term = term;
234:                    this .score = score;
235:                    this .fuzziedSourceTerm = fuzziedSourceTerm;
236:                }
237:            }
238:
239:            private static class ScoreTermQueue extends PriorityQueue {
240:                public ScoreTermQueue(int size) {
241:                    initialize(size);
242:                }
243:
244:                /* (non-Javadoc)
245:                 * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
246:                 */
247:                protected boolean lessThan(Object a, Object b) {
248:                    ScoreTerm termA = (ScoreTerm) a;
249:                    ScoreTerm termB = (ScoreTerm) b;
250:                    if (termA.score == termB.score)
251:                        return termA.term.compareTo(termB.term) > 0;
252:                    else
253:                        return termA.score < termB.score;
254:                }
255:
256:            }
257:
258:            //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
259:            private static class FuzzyTermQuery extends TermQuery {
260:                boolean ignoreTF;
261:
262:                public FuzzyTermQuery(Term t, boolean ignoreTF) {
263:                    super (t);
264:                    this .ignoreTF = ignoreTF;
265:                }
266:
267:                public Similarity getSimilarity(Searcher searcher) {
268:                    Similarity result = super .getSimilarity(searcher);
269:                    result = new SimilarityDelegator(result) {
270:
271:                        public float tf(float freq) {
272:                            if (ignoreTF) {
273:                                return 1; //ignore tf
274:                            }
275:                            return super .tf(freq);
276:                        }
277:
278:                        public float idf(int docFreq, int numDocs) {
279:                            //IDF is already factored into individual term boosts
280:                            return 1;
281:                        }
282:                    };
283:                    return result;
284:                }
285:            }
286:
287:            /* (non-Javadoc)
288:             * @see org.apache.lucene.search.Query#toString(java.lang.String)
289:             */
290:            public String toString(String field) {
291:                return null;
292:            }
293:
294:            public boolean isIgnoreTF() {
295:                return ignoreTF;
296:            }
297:
298:            public void setIgnoreTF(boolean ignoreTF) {
299:                this.ignoreTF = ignoreTF;
300:            }
301:
302:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.