/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;

import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.BoolField;
import org.apache.solr.search.*;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.SimpleOrderedMap;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

/**
 * A class that generates simple Facet information for a request.
 *
 * More advanced facet implementations may compose or subclass this class
 * to leverage any of its functionality.
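 * <p>
 * A minimal usage sketch (the variable names here are illustrative; a
 * request handler would typically already have these objects in hand):
 * <pre>
 *   SimpleFacets f = new SimpleFacets(searcher, matchingDocs, solrParams);
 *   NamedList facetInfo = f.getFacetCounts();  // null if faceting is disabled
 *   rsp.add("facet_counts", facetInfo);
 * </pre>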
 */
public class SimpleFacets {

  /** The main set of documents all facet counts should be relative to */
  protected DocSet docs;
  /** The parameters whose values drive the faceting behavior */
  protected SolrParams params;
  /** Searcher to use for all calculations */
  protected SolrIndexSearcher searcher;

  public SimpleFacets(SolrIndexSearcher searcher, DocSet docs,
                      SolrParams params) {
    this.searcher = searcher;
    this.docs = docs;
    this.params = params;
  }

  /**
   * Looks at various params to determine if any simple Facet Constraint count
   * computations are desired.
   *
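   * <p>
   * For example, a request carrying the (illustrative) parameters
   * <pre>
   *   facet=true&amp;facet.field=cat&amp;facet.query=price:[0 TO 100]
   * </pre>
   * would yield both a "facet_queries" and a "facet_fields" section in the
   * returned list.
   *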
   * @see #getFacetQueryCounts
   * @see #getFacetFieldCounts
   * @see SolrParams#FACET
   * @return a NamedList of Facet Count info or null
   */
  public NamedList getFacetCounts() {

    // if someone called this method, benefit of the doubt: assume true
    if (!params.getBool(SolrParams.FACET, true))
      return null;

    NamedList res = new SimpleOrderedMap();
    try {

      res.add("facet_queries", getFacetQueryCounts());

      res.add("facet_fields", getFacetFieldCounts());

    } catch (Exception e) {
      SolrException.logOnce(SolrCore.log,
          "Exception during facet counts", e);
      res.add("exception", SolrException.toStr(e));
    }
    return res;
  }

  /**
   * Returns a list of facet counts for each of the facet queries
   * specified in the params.
   *
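   * Each entry in the returned list is keyed by the raw query string, with
   * the count of documents in <code>docs</code> matching that query as the
   * value.
   *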
   * @see SolrParams#FACET_QUERY
   */
  public NamedList getFacetQueryCounts() throws IOException, ParseException {

    NamedList res = new SimpleOrderedMap();

    /* Ignore SolrParams.DF - an init param facet.query could be assuming
     * the schema default, with the query param DF intended to affect only Q.
     * If the user doesn't want the schema default for facet.query, they
     * should be explicit.
     */
    SolrQueryParser qp = searcher.getSchema().getSolrQueryParser(null);

    String[] facetQs = params.getParams(SolrParams.FACET_QUERY);
    if (null != facetQs && 0 != facetQs.length) {
      for (String q : facetQs) {
        res.add(q, searcher.numDocs(qp.parse(q), docs));
      }
    }

    return res;
  }

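  /**
   * Computes term counts for a single facet field: resolves the per-field
   * params (offset, limit, mincount, missing, sort, prefix) and then
   * delegates to either the FieldCache or the TermEnum strategy depending
   * on the field's type.
   *
   * @see #getFieldCacheCounts
   * @see #getFacetTermEnumCounts
   */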
  public NamedList getTermCounts(String field) throws IOException {
    int offset = params.getFieldInt(field, SolrParams.FACET_OFFSET, 0);
    int limit = params.getFieldInt(field, SolrParams.FACET_LIMIT, 100);
    Integer mincount = params.getFieldInt(field, SolrParams.FACET_MINCOUNT);
    if (mincount == null) {
      Boolean zeros = params.getFieldBool(field, SolrParams.FACET_ZEROS);
      // current default is to include zeros
      mincount = (zeros != null && !zeros) ? 1 : 0;
    }
    boolean missing = params.getFieldBool(field, SolrParams.FACET_MISSING, false);
    // default to sorting if there is a limit.
    boolean sort = params.getFieldBool(field, SolrParams.FACET_SORT, limit > 0);
    String prefix = params.getFieldParam(field, SolrParams.FACET_PREFIX);

    NamedList counts;
    SchemaField sf = searcher.getSchema().getField(field);
    FieldType ft = sf.getType();
    if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
      // Multi-valued and tokenized fields can't use the FieldCache method;
      // booleans always use filters since the number of values is very small.
      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit,
                                      mincount, missing, sort, prefix);
    } else {
      // TODO: future logic could use filters instead of the fieldcache if
      // the number of terms in the field is small enough.
      counts = getFieldCacheCounts(searcher, docs, field, offset, limit,
                                   mincount, missing, sort, prefix);
    }

    return counts;
  }

  /**
   * Returns a list of value constraints and the associated facet counts
   * for each facet field specified in the params.
   *
   * @see SolrParams#FACET_FIELD
   * @see #getFieldMissingCount
   * @see #getFacetTermEnumCounts
   */
  public NamedList getFacetFieldCounts() throws IOException {

    NamedList res = new SimpleOrderedMap();
    String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
    if (null != facetFs) {
      for (String f : facetFs) {
        res.add(f, getTermCounts(f));
      }
    }
    return res;
  }

  /**
   * Returns a count of the documents in the set which do not have any
   * terms for the specified field.
   *
   * @see SolrParams#FACET_MISSING
   */
  public static int getFieldMissingCount(SolrIndexSearcher searcher,
      DocSet docs, String fieldName) throws IOException {

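    // An open-ended range query (null lower and upper bounds) matches every
    // document that has at least one indexed term in the field; subtracting
    // that set from the base set leaves the "missing" documents.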
    DocSet hasVal = searcher.getDocSet(new ConstantScoreRangeQuery(
        fieldName, null, null, false, false));
    return docs.andNotSize(hasVal);
  }

  // The first element of the fieldcache lookup array is null (representing
  // documents with no value for the field), so binary searches over it need
  // a null-tolerant comparator.
  private static final Comparator nullStrComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
      if (o1 == null)
        return (o2 == null) ? 0 : -1;
      else if (o2 == null)
        return 1;
      return ((String) o1).compareTo((String) o2);
    }
  };

  /**
   * Use the Lucene FieldCache to get counts for each unique field value
   * in <code>docs</code>.  The field must have at most one indexed token
   * per document.
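   * <p>
   * Counting is a single pass over the FieldCache's per-document term
   * ordinals, followed by a pass over the candidate terms to sort and/or
   * page the results.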
   */
  public static NamedList getFieldCacheCounts(
      SolrIndexSearcher searcher, DocSet docs, String fieldName,
      int offset, int limit, int mincount, boolean missing,
      boolean sort, String prefix) throws IOException {
    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
    // we should use an alternate strategy to avoid
    // 1) creating another huge int[] for the counts
    // 2) looping over that huge int[] looking for the rare non-zeros.
    //
    // Yet another variation: if docs.size() is small and termvectors are stored,
    // then use them instead of the FieldCache.

    // TODO: this function is too big and could use some refactoring, but
    // we also need a facet cache, and refactoring of SimpleFacets instead of
    // trying to pass all the various params around.

    FieldType ft = searcher.getSchema().getFieldType(fieldName);
    NamedList res = new NamedList();

    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(
        searcher.getReader(), fieldName);
    final String[] terms = si.lookup;
    final int[] termNum = si.order;
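    // si.lookup holds the field's unique terms in sorted order (index 0 is
    // null, standing in for documents with no value for the field), and
    // si.order maps each docid to the index of its term in that array.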

    if (prefix != null && prefix.length() == 0)
      prefix = null;

    int startTermIndex, endTermIndex;
    if (prefix != null) {
      startTermIndex = Arrays.binarySearch(terms, prefix, nullStrComparator);
      if (startTermIndex < 0)
        startTermIndex = -startTermIndex - 1;
      // find the end term.  \uffff isn't a legal unicode char, but only
      // compareTo is used, so it works fine and is guaranteed to be bigger
      // than any legal char.
      endTermIndex = Arrays.binarySearch(terms,
          prefix + "\uffff\uffff\uffff\uffff", nullStrComparator);
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = 1;  // skip the null entry at index 0
      endTermIndex = terms.length;
    }

    final int nTerms = endTermIndex - startTermIndex;

    if (nTerms > 0) {

      // count collection array only needs to be as big as the number of
      // terms we are going to collect counts for.
      final int[] counts = new int[nTerms];

      DocIterator iter = docs.iterator();
      while (iter.hasNext()) {
        int term = termNum[iter.nextDoc()];
        int arrIdx = term - startTermIndex;
        if (arrIdx >= 0 && arrIdx < nTerms)
          counts[arrIdx]++;
      }

      // IDEA: we could also maintain a count of "other"... everything that
      // fell outside of the top 'N'

      int off = offset;
      int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

      if (sort) {
        int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
        maxsize = Math.min(maxsize, nTerms);
        final BoundedTreeSet<CountPair<String, Integer>> queue =
            new BoundedTreeSet<CountPair<String, Integer>>(maxsize);
        int min = mincount - 1;  // the smallest value in the top 'N' values
        for (int i = 0; i < nTerms; i++) {
          int c = counts[i];
          if (c > min) {
            // NOTE: we use c>min rather than c>=min as an optimization because
            // we are going in index order, so we already know that the keys
            // are ordered.  This can be very important if a lot of the counts
            // are repeated (like zero counts would be).
            queue.add(new CountPair<String, Integer>(terms[startTermIndex + i], c));
            if (queue.size() >= maxsize)
              min = queue.last().val;
          }
        }
        // now select the right page from the results
        for (CountPair<String, Integer> p : queue) {
          if (--off >= 0)
            continue;
          if (--lim < 0)
            break;
          res.add(ft.indexedToReadable(p.key), p.val);
        }
      } else {
        // add results in index order
        int i = 0;
        if (mincount <= 0) {
          // if mincount<=0, then we won't discard any terms and we know
          // exactly where to start.
          i = off;
          off = 0;
        }

        for (; i < nTerms; i++) {
          int c = counts[i];
          if (c < mincount || --off >= 0)
            continue;
          if (--lim < 0)
            break;
          res.add(ft.indexedToReadable(terms[startTermIndex + i]), c);
        }
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, fieldName));
    }

    return res;
  }

  /**
   * Returns a list of terms in the specified field along with the
   * corresponding count of documents in the set that match that constraint.
   * This method uses the FilterCache to get the intersection count between
   * <code>docs</code> and the DocSet for each term in the filter.
   *
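   * Terms whose docFreq is at least the per-field
   * {@link SolrParams#FACET_ENUM_CACHE_MINDF} value are counted using the
   * filterCache; rarer terms are counted by stepping through their TermDocs
   * directly.
   *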
   * @see SolrParams#FACET_LIMIT
   * @see SolrParams#FACET_ZEROS
   * @see SolrParams#FACET_MISSING
   */
  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher,
      DocSet docs, String field, int offset, int limit,
      int mincount, boolean missing, boolean sort, String prefix)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = params.getFieldInt(field,
        SolrParams.FACET_ENUM_CACHE_MINDF, 0);

    IndexSchema schema = searcher.getSchema();
    IndexReader r = searcher.getReader();
    FieldType ft = schema.getFieldType(field);

    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<String, Integer>> queue = sort
        ? new BoundedTreeSet<CountPair<String, Integer>>(maxsize)
        : null;
    final NamedList res = new NamedList();

    int min = mincount - 1;  // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

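    // Position the term enumerator at the first term of the field (or at the
    // first term >= the prefix); enumeration then proceeds in sorted order.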
    String startTerm = prefix == null ? "" : ft.toInternal(prefix);
    TermEnum te = r.terms(new Term(field, startTerm));
    TermDocs td = r.termDocs();
    do {
      Term t = te.term();

      if (null == t || !t.field().equals(field))
        break;

      if (prefix != null && !t.text().startsWith(prefix))
        break;

      int df = te.docFreq();

      // If we are sorting, we can use df>min (rather than >=) since we
      // are going in index order.  For certain term distributions this can
      // make a large difference (for example, many terms with df=1).
      if (df > 0 && df > min) {
        int c;

        if (df >= minDfFilterCache) {
          // use the filter cache
          c = searcher.numDocs(new TermQuery(t), docs);
        } else {
          // iterate over TermDocs to calculate the intersection
          td.seek(te);
          c = 0;
          while (td.next()) {
            if (docs.exists(td.doc()))
              c++;
          }
        }

        if (sort) {
          if (c > min) {
            queue.add(new CountPair<String, Integer>(t.text(), c));
            if (queue.size() >= maxsize)
              min = queue.last().val;
          }
        } else {
          if (c >= mincount && --off < 0) {
            if (--lim < 0)
              break;
            res.add(ft.indexedToReadable(t.text()), c);
          }
        }
      }
    } while (te.next());

    if (sort) {
      for (CountPair<String, Integer> p : queue) {
        if (--off >= 0)
          continue;
        if (--lim < 0)
          break;
        res.add(ft.indexedToReadable(p.key), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    te.close();
    td.close();

    return res;
  }

  /**
   * A simple key=&gt;val pair whose natural order is such that
   * <b>higher</b> vals come before lower vals.
   * In case of tied vals, <b>lower</b> keys come before higher keys.
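   * <p>
   * For example, <code>("b",5)</code> sorts before <code>("a",3)</code>
   * (higher val wins), and <code>("a",3)</code> sorts before
   * <code>("b",3)</code> (ties broken by key).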
   */
  public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>>
      implements Comparable<CountPair<K, V>> {

    public CountPair(K k, V v) {
      key = k;
      val = v;
    }

    public K key;
    public V val;

    public int hashCode() {
      return key.hashCode() ^ val.hashCode();
    }

    public boolean equals(Object o) {
      return (o instanceof CountPair)
          && (0 == this.compareTo((CountPair<K, V>) o));
    }

    public int compareTo(CountPair<K, V> o) {
      int vc = o.val.compareTo(val);
      return (0 != vc ? vc : key.compareTo(o.key));
    }
  }
}
|