Source Code Cross Referenced for SimpleFacets.java in » Search-Engine » apache-solr-1.2.0 » org » apache » solr » request



/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.request;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.request.SolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.BoolField;
import org.apache.solr.search.*;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.SimpleOrderedMap;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

/**
 * A class that generates simple Facet information for a request.
 *
 * More advanced facet implementations may compose or subclass this class
 * to leverage any of its functionality.
 */
public class SimpleFacets {

    /** The main set of documents all facet counts should be relative to */
    protected DocSet docs;
    /** Configuration params that should drive behavior */
    protected SolrParams params;
    /** Searcher to use for all calculations */
    protected SolrIndexSearcher searcher;

    public SimpleFacets(SolrIndexSearcher searcher, DocSet docs,
                        SolrParams params) {
        this.searcher = searcher;
        this.docs = docs;
        this.params = params;
    }

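    /* Illustrative usage (editor's sketch, not part of the original
     * source): a request handler holding the current searcher, the base
     * DocSet of results, and the request params would typically do
     * something like the following; "rsp" (a SolrQueryResponse) is
     * hypothetical here.
     *
     *   SimpleFacets f = new SimpleFacets(searcher, results, params);
     *   NamedList facetInfo = f.getFacetCounts();
     *   if (null != facetInfo) rsp.add("facet_counts", facetInfo);
     */
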
    /**
     * Looks at various Params to determine whether any simple Facet
     * Constraint count computations are desired.
     *
     * @see #getFacetQueryCounts
     * @see #getFacetFieldCounts
     * @see SolrParams#FACET
     * @return a NamedList of Facet Count info or null
     */
    public NamedList getFacetCounts() {

        // if someone called this method, benefit of the doubt: assume true
        if (!params.getBool(params.FACET, true))
            return null;

        NamedList res = new SimpleOrderedMap();
        try {
            res.add("facet_queries", getFacetQueryCounts());
            res.add("facet_fields", getFacetFieldCounts());
        } catch (Exception e) {
            SolrException.logOnce(SolrCore.log,
                    "Exception during facet counts", e);
            res.add("exception", SolrException.toStr(e));
        }
        return res;
    }

    /**
     * Returns a list of facet counts for each of the facet queries
     * specified in the params
     *
     * @see SolrParams#FACET_QUERY
     */
    public NamedList getFacetQueryCounts() throws IOException,
            ParseException {

        NamedList res = new SimpleOrderedMap();

        /* Ignore SolrParams.DF - an init param facet.query may assume the
         * schema default field, while the query param DF is intended to
         * affect only Q.  If the user doesn't want the schema default for
         * facet.query, they should be explicit.
         */
        SolrQueryParser qp = searcher.getSchema().getSolrQueryParser(null);

        String[] facetQs = params.getParams(SolrParams.FACET_QUERY);
        if (null != facetQs && 0 != facetQs.length) {
            for (String q : facetQs) {
                res.add(q, searcher.numDocs(qp.parse(q), docs));
            }
        }

        return res;
    }

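    /* Illustrative request (editor's sketch, not part of the original
     * source): with parameters such as
     *
     *   facet=true&facet.query=price:[* TO 100]&facet.query=inStock:true
     *
     * the NamedList returned above holds one entry per query string,
     * keyed by the raw query, e.g. {"price:[* TO 100]"=42,
     * "inStock:true"=17} (the counts are hypothetical).
     */
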
    public NamedList getTermCounts(String field) throws IOException {
        int offset = params.getFieldInt(field, params.FACET_OFFSET, 0);
        int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
        Integer mincount = params.getFieldInt(field, params.FACET_MINCOUNT);
        if (mincount == null) {
            Boolean zeros = params.getFieldBool(field, params.FACET_ZEROS);
            // mincount = (zeros!=null && zeros) ? 0 : 1;
            mincount = (zeros != null && !zeros) ? 1 : 0;
            // current default is to include zeros.
        }
        boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
        // default to sorting if there is a limit.
        boolean sort = params.getFieldBool(field, params.FACET_SORT, limit > 0);
        String prefix = params.getFieldParam(field, params.FACET_PREFIX);

        NamedList counts;
        SchemaField sf = searcher.getSchema().getField(field);
        FieldType ft = sf.getType();
        if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
            // Always use filters for booleans... we know the number of values is very small.
            counts = getFacetTermEnumCounts(searcher, docs, field, offset,
                    limit, mincount, missing, sort, prefix);
        } else {
            // TODO: future logic could use filters instead of the fieldcache if
            // the number of terms in the field is small enough.
            counts = getFieldCacheCounts(searcher, docs, field, offset,
                    limit, mincount, missing, sort, prefix);
        }

        return counts;
    }

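    /* Illustrative request (editor's sketch, not part of the original
     * source): every knob read above may also be set per field via the
     * f.<field>. prefix, e.g.
     *
     *   facet.field=cat&f.cat.facet.limit=5&f.cat.facet.mincount=1
     *
     * restricts the "cat" facet to its five most frequent values that
     * match at least one document in the base set.
     */
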
    /**
     * Returns a list of value constraints and the associated facet counts
     * for each facet field specified in the params.
     *
     * @see SolrParams#FACET_FIELD
     * @see #getFieldMissingCount
     * @see #getFacetTermEnumCounts
     */
    public NamedList getFacetFieldCounts() throws IOException {

        NamedList res = new SimpleOrderedMap();
        String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
        if (null != facetFs) {
            for (String f : facetFs) {
                res.add(f, getTermCounts(f));
            }
        }
        return res;
    }

    /**
     * Returns a count of the documents in the set which do not have any
     * terms for the specified field.
     *
     * @see SolrParams#FACET_MISSING
     */
    public static int getFieldMissingCount(SolrIndexSearcher searcher,
            DocSet docs, String fieldName) throws IOException {

        // an open-ended range query matches every doc with at least one
        // indexed term in the field
        DocSet hasVal = searcher.getDocSet(new ConstantScoreRangeQuery(
                fieldName, null, null, false, false));
        return docs.andNotSize(hasVal);
    }

    // first element of the fieldcache is null, so we need this comparator.
    private static final Comparator nullStrComparator = new Comparator() {
        public int compare(Object o1, Object o2) {
            if (o1 == null)
                return (o2 == null) ? 0 : -1;
            else if (o2 == null)
                return 1;
            return ((String) o1).compareTo((String) o2);
        }
    };

    /**
     * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
     * The field must have at most one indexed token per document.
     */
    public static NamedList getFieldCacheCounts(
            SolrIndexSearcher searcher, DocSet docs, String fieldName,
            int offset, int limit, int mincount, boolean missing,
            boolean sort, String prefix) throws IOException {
        // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
        //  we should use an alternate strategy to avoid
        //  1) creating another huge int[] for the counts
        //  2) looping over that huge int[] looking for the rare non-zeros.
        //
        // Yet another variation: if docs.size() is small and termvectors are stored,
        // then use them instead of the FieldCache.
        //

        // TODO: this function is too big and could use some refactoring, but
        // we also need a facet cache, and refactoring of SimpleFacets instead of
        // trying to pass all the various params around.

        FieldType ft = searcher.getSchema().getFieldType(fieldName);
        NamedList res = new NamedList();

        FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(
                searcher.getReader(), fieldName);
        final String[] terms = si.lookup;
        final int[] termNum = si.order;

        if (prefix != null && prefix.length() == 0)
            prefix = null;

        int startTermIndex, endTermIndex;
        if (prefix != null) {
            startTermIndex = Arrays.binarySearch(terms, prefix, nullStrComparator);
            if (startTermIndex < 0)
                startTermIndex = -startTermIndex - 1;
            // find the end term.  \uffff isn't a legal unicode char, but only compareTo
            // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
            endTermIndex = Arrays.binarySearch(terms,
                    prefix + "\uffff\uffff\uffff\uffff", nullStrComparator);
            endTermIndex = -endTermIndex - 1;
        } else {
            startTermIndex = 1;
            endTermIndex = terms.length;
        }

        final int nTerms = endTermIndex - startTermIndex;

        if (nTerms > 0) {

            // count collection array only needs to be as big as the number of terms we are
            // going to collect counts for.
            final int[] counts = new int[nTerms];

            DocIterator iter = docs.iterator();
            while (iter.hasNext()) {
                int term = termNum[iter.nextDoc()];
                int arrIdx = term - startTermIndex;
                if (arrIdx >= 0 && arrIdx < nTerms)
                    counts[arrIdx]++;
            }

            // IDEA: we could also maintain a count of "other"... everything that fell outside
            // of the top 'N'

            int off = offset;
            int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

            if (sort) {
                int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
                maxsize = Math.min(maxsize, nTerms);
                final BoundedTreeSet<CountPair<String, Integer>> queue =
                        new BoundedTreeSet<CountPair<String, Integer>>(maxsize);
                int min = mincount - 1; // the smallest value in the top 'N' values
                for (int i = 0; i < nTerms; i++) {
                    int c = counts[i];
                    if (c > min) {
                        // NOTE: we use c>min rather than c>=min as an optimization because we are going in
                        // index order, so we already know that the keys are ordered.  This can be very
                        // important if a lot of the counts are repeated (like zero counts would be).
                        queue.add(new CountPair<String, Integer>(
                                terms[startTermIndex + i], c));
                        if (queue.size() >= maxsize)
                            min = queue.last().val;
                    }
                }
                // now select the right page from the results
                for (CountPair<String, Integer> p : queue) {
                    if (--off >= 0)
                        continue;
                    if (--lim < 0)
                        break;
                    res.add(ft.indexedToReadable(p.key), p.val);
                }
            } else {
                // add results in index order
                int i = 0;
                if (mincount <= 0) {
                    // if mincount<=0, then we won't discard any terms and we know exactly
                    // where to start.
                    i = off;
                    off = 0;
                }

                for (; i < nTerms; i++) {
                    int c = counts[i];
                    if (c < mincount || --off >= 0)
                        continue;
                    if (--lim < 0)
                        break;
                    res.add(ft.indexedToReadable(terms[startTermIndex + i]), c);
                }
            }
        }

        if (missing) {
            res.add(null, getFieldMissingCount(searcher, docs, fieldName));
        }

        return res;
    }

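    /* Worked example (editor's sketch, not part of the original source):
     * with no prefix, si.lookup = {null, "apple", "banana"} and three
     * matching docs whose ordinals in si.order are 1, 1 and 2, the
     * counting loop above yields counts = {2, 1} (startTermIndex is 1),
     * which is reported as apple=2, banana=1.
     */
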
    /**
     * Returns a list of terms in the specified field along with the
     * corresponding count of documents in the set that match that constraint.
     * This method uses the FilterCache to get the intersection count between <code>docs</code>
     * and the DocSet for each term in the filter.
     *
     * @see SolrParams#FACET_LIMIT
     * @see SolrParams#FACET_ZEROS
     * @see SolrParams#FACET_MISSING
     */
    public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher,
            DocSet docs, String field, int offset, int limit,
            int mincount, boolean missing, boolean sort, String prefix)
            throws IOException {

        /* :TODO: potential optimization...
         * cache the Terms with the highest docFreq and try them first
         * don't enum if we get our max from them
         */

        // Minimum term docFreq in order to use the filterCache for that term.
        int minDfFilterCache = params.getFieldInt(field,
                SolrParams.FACET_ENUM_CACHE_MINDF, 0);

        IndexSchema schema = searcher.getSchema();
        IndexReader r = searcher.getReader();
        FieldType ft = schema.getFieldType(field);

        final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
        final BoundedTreeSet<CountPair<String, Integer>> queue = sort
                ? new BoundedTreeSet<CountPair<String, Integer>>(maxsize)
                : null;
        final NamedList res = new NamedList();

        int min = mincount - 1; // the smallest value in the top 'N' values
        int off = offset;
        int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

        String startTerm = prefix == null ? "" : ft.toInternal(prefix);
        TermEnum te = r.terms(new Term(field, startTerm));
        TermDocs td = r.termDocs();
        do {
            Term t = te.term();

            if (null == t || !t.field().equals(field))
                break;

            if (prefix != null && !t.text().startsWith(prefix))
                break;

            int df = te.docFreq();

            // If we are sorting, we can use df>min (rather than >=) since we
            // are going in index order.  For certain term distributions this can
            // make a large difference (for example, many terms with df=1).
            if (df > 0 && df > min) {
                int c;

                if (df >= minDfFilterCache) {
                    // use the filter cache
                    c = searcher.numDocs(new TermQuery(t), docs);
                } else {
                    // iterate over TermDocs to calculate the intersection
                    td.seek(te);
                    c = 0;
                    while (td.next()) {
                        if (docs.exists(td.doc()))
                            c++;
                    }
                }

                if (sort) {
                    if (c > min) {
                        queue.add(new CountPair<String, Integer>(t.text(), c));
                        if (queue.size() >= maxsize)
                            min = queue.last().val;
                    }
                } else {
                    if (c >= mincount && --off < 0) {
                        if (--lim < 0)
                            break;
                        res.add(ft.indexedToReadable(t.text()), c);
                    }
                }
            }
        } while (te.next());

        if (sort) {
            for (CountPair<String, Integer> p : queue) {
                if (--off >= 0)
                    continue;
                if (--lim < 0)
                    break;
                res.add(ft.indexedToReadable(p.key), p.val);
            }
        }

        if (missing) {
            res.add(null, getFieldMissingCount(searcher, docs, field));
        }

        te.close();
        td.close();

        return res;
    }

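    /* Illustrative tuning (editor's sketch, not part of the original
     * source): setting
     *
     *   f.manu.facet.enum.cache.minDf=25
     *
     * makes the loop above compute counts for any "manu" term matching
     * fewer than 25 documents by iterating its TermDocs directly, so the
     * filterCache is only populated with DocSets likely to be reused.
     */
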
    /**
     * A simple key=>val pair whose natural order is such that
     * <b>higher</b> vals come before lower vals.
     * In case of tie vals, <b>lower</b> keys come before higher keys.
     */
    public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>>
            implements Comparable<CountPair<K, V>> {

        public CountPair(K k, V v) {
            key = k;
            val = v;
        }

        public K key;
        public V val;

        public int hashCode() {
            return key.hashCode() ^ val.hashCode();
        }

        public boolean equals(Object o) {
            return (o instanceof CountPair)
                    && (0 == this.compareTo((CountPair<K, V>) o));
        }

        public int compareTo(CountPair<K, V> o) {
            int vc = o.val.compareTo(val);
            return (0 != vc ? vc : key.compareTo(o.key));
        }
    }
}
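
Because CountPair's compareTo inverts the value comparison, its natural order puts higher counts first and breaks ties on the ascending key. A minimal standalone sketch (editor's addition, assuming the Solr 1.2.0 jar is on the classpath; the class name CountPairDemo is hypothetical):

import java.util.TreeSet;
import org.apache.solr.request.SimpleFacets.CountPair;

public class CountPairDemo {
    public static void main(String[] args) {
        // TreeSet iterates in CountPair's natural order
        TreeSet<CountPair<String, Integer>> set =
                new TreeSet<CountPair<String, Integer>>();
        set.add(new CountPair<String, Integer>("banana", 5));
        set.add(new CountPair<String, Integer>("apple", 5));
        set.add(new CountPair<String, Integer>("cherry", 9));
        for (CountPair<String, Integer> p : set) {
            // prints cherry=9, then apple=5, then banana=5
            System.out.println(p.key + "=" + p.val);
        }
    }
}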