001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
 */
package org.apache.solr.util;
017:
018: import java.io.IOException;
019: import java.io.StringReader;
020: import java.util.HashMap;
021: import java.util.HashSet;
022: import java.util.Map;
023: import java.util.Set;
024: import java.util.List;
025: import java.util.LinkedList;
026: import java.util.ArrayList;
027: import java.util.ListIterator;
028:
029: import org.apache.solr.request.*;
030: import org.apache.solr.search.DocIterator;
031: import org.apache.solr.search.DocList;
032: import org.apache.solr.search.SolrIndexSearcher;
033: import org.apache.solr.schema.SchemaField;
034:
035: import org.apache.lucene.analysis.*;
036: import org.apache.lucene.document.Document;
037: import org.apache.lucene.search.Query;
038: import org.apache.lucene.search.highlight.*;
039:
040: /**
041: * Collection of Utility and Factory methods for Highlighting.
042: */
043: public class HighlightingUtils {
044: private static final String SIMPLE = "simple";
045:
046: private static final String HIGHLIGHT = "hl";
047: private static final String PREFIX = "hl.";
048: private static final String FIELDS = PREFIX + "fl";
049: private static final String SNIPPETS = PREFIX + "snippets";
050: private static final String FRAGSIZE = PREFIX + "fragsize";
051: private static final String FORMATTER = PREFIX + "formatter";
052: private static final String SIMPLE_PRE = PREFIX + SIMPLE + ".pre";
053: private static final String SIMPLE_POST = PREFIX + SIMPLE + ".post";
054: private static final String FIELD_MATCH = PREFIX
055: + "requireFieldMatch";
056:
057: private static SolrParams DEFAULTS = null;
058: static {
059: Map<String, String> map = new HashMap<String, String>();
060: map.put(SNIPPETS, "1");
061: map.put(FRAGSIZE, "100");
062: map.put(FORMATTER, SIMPLE);
063: map.put(SIMPLE_PRE, "<em>");
064: map.put(SIMPLE_POST, "</em>");
065:
066: DEFAULTS = new MapSolrParams(map);
067: }
068:
069: /** Combine request parameters with highlighting defaults. */
070: private static SolrParams getParams(SolrQueryRequest request) {
071: return new DefaultSolrParams(request.getParams(), DEFAULTS);
072: }
073:
074: /**
075: * Check whether Highlighting is enabled for this request.
076: * @param request The current SolrQueryRequest
077: * @return <code>true</code> if highlighting enabled, <code>false</code> if not.
078: */
079: public static boolean isHighlightingEnabled(SolrQueryRequest request) {
080: return getParams(request).getBool(HIGHLIGHT, false);
081: }
082:
083: /**
084: * Return a Highlighter appropriate for this field.
085: * @param query The current Query
086: * @param fieldName The name of the field
087: * @param request The current SolrQueryRequest
088: */
089: public static Highlighter getHighlighter(Query query,
090: String fieldName, SolrQueryRequest request) {
091: Highlighter highlighter = new Highlighter(getFormatter(
092: fieldName, request), getQueryScorer(query, fieldName,
093: request));
094: highlighter
095: .setTextFragmenter(getFragmenter(fieldName, request));
096: return highlighter;
097: }
098:
099: /**
100: * Return a QueryScorer suitable for this Query and field.
101: * @param query The current query
102: * @param fieldName The name of the field
103: * @param request The SolrQueryRequest
104: */
105: public static QueryScorer getQueryScorer(Query query,
106: String fieldName, SolrQueryRequest request) {
107: boolean reqFieldMatch = getParams(request).getFieldBool(
108: fieldName, FIELD_MATCH, false);
109: if (reqFieldMatch) {
110: return new QueryScorer(query, request.getSearcher()
111: .getReader(), fieldName);
112: } else {
113: return new QueryScorer(query);
114: }
115: }
116:
117: /**
118: * Return a String array of the fields to be highlighted.
119: * Falls back to the programatic defaults, or the default search field if the list of fields
120: * is not specified in either the handler configuration or the request.
121: * @param query The current Query
122: * @param request The current SolrQueryRequest
123: * @param defaultFields Programmatic default highlight fields, used if nothing is specified in the handler config or the request.
124: */
125: public static String[] getHighlightFields(Query query,
126: SolrQueryRequest request, String[] defaultFields) {
127: String fields[] = getParams(request).getParams(FIELDS);
128:
129: // if no fields specified in the request, or the handler, fall back to programmatic default, or default search field.
130: if (emptyArray(fields)) {
131: // use default search field if highlight fieldlist not specified.
132: if (emptyArray(defaultFields)) {
133: fields = new String[] { request.getSchema()
134: .getDefaultSearchFieldName() };
135: } else {
136: fields = defaultFields;
137: }
138: } else if (fields.length == 1) {
139: // if there's a single request/handler value, it may be a space/comma separated list
140: fields = SolrPluginUtils.split(fields[0]);
141: }
142:
143: return fields;
144: }
145:
146: private static boolean emptyArray(String[] arr) {
147: return (arr == null || arr.length == 0 || arr[0] == null || arr[0]
148: .trim().length() == 0);
149: }
150:
151: /**
152: * Return the max number of snippets for this field. If this has not
153: * been configured for this field, fall back to the configured default
154: * or the solr default.
155: * @param fieldName The name of the field
156: * @param request The current SolrQueryRequest
157: */
158: public static int getMaxSnippets(String fieldName,
159: SolrQueryRequest request) {
160: return Integer.parseInt(getParams(request).getFieldParam(
161: fieldName, SNIPPETS));
162: }
163:
164: /**
165: * Return a formatter appropriate for this field. If a formatter
166: * has not been configured for this field, fall back to the configured
167: * default or the solr default (SimpleHTMLFormatter).
168: *
169: * @param fieldName The name of the field
170: * @param request The current SolrQueryRequest
171: * @return An appropriate Formatter.
172: */
173: public static Formatter getFormatter(String fieldName,
174: SolrQueryRequest request) {
175: SolrParams p = getParams(request);
176:
177: // SimpleHTMLFormatter is the only supported Formatter at the moment
178: return new SimpleHTMLFormatter(p.getFieldParam(fieldName,
179: SIMPLE_PRE), p.getFieldParam(fieldName, SIMPLE_POST));
180: }
181:
182: /**
183: * Return a fragmenter appropriate for this field. If a fragmenter
184: * has not been configured for this field, fall back to the configured
185: * default or the solr default (GapFragmenter).
186: *
187: * @param fieldName The name of the field
188: * @param request The current SolrQueryRequest
189: * @return An appropriate Fragmenter.
190: */
191: public static Fragmenter getFragmenter(String fieldName,
192: SolrQueryRequest request) {
193: int fragsize = Integer.parseInt(getParams(request)
194: .getFieldParam(fieldName, FRAGSIZE));
195: return (fragsize <= 0) ? new NullFragmenter()
196: : new GapFragmenter(fragsize);
197: }
198:
199: /**
200: * Generates a list of Highlighted query fragments for each item in a list
201: * of documents, or returns null if highlighting is disabled.
202: *
203: * @param docs query results
204: * @param query the query
205: * @param req the current request
206: * @param defaultFields default list of fields to summarize
207: *
208: * @return NamedList containing a NamedList for each document, which in
209: * turns contains sets (field, summary) pairs.
210: */
211: public static NamedList doHighlighting(DocList docs, Query query,
212: SolrQueryRequest req, String[] defaultFields)
213: throws IOException {
214: if (!isHighlightingEnabled(req))
215: return null;
216:
217: SolrIndexSearcher searcher = req.getSearcher();
218: NamedList fragments = new SimpleOrderedMap();
219: String[] fieldNames = getHighlightFields(query, req,
220: defaultFields);
221: Document[] readDocs = new Document[docs.size()];
222: {
223: // pre-fetch documents using the Searcher's doc cache
224: Set<String> fset = new HashSet<String>();
225: for (String f : fieldNames) {
226: fset.add(f);
227: }
228: // fetch unique key if one exists.
229: SchemaField keyField = req.getSearcher().getSchema()
230: .getUniqueKeyField();
231: if (null != keyField)
232: fset.add(keyField.getName());
233: searcher.readDocs(readDocs, docs, fset);
234: }
235:
236: // Highlight each document
237: DocIterator iterator = docs.iterator();
238: for (int i = 0; i < docs.size(); i++) {
239: int docId = iterator.nextDoc();
240: Document doc = readDocs[i];
241: NamedList docSummaries = new SimpleOrderedMap();
242: for (String fieldName : fieldNames) {
243: fieldName = fieldName.trim();
244: String[] docTexts = doc.getValues(fieldName);
245: if (docTexts == null)
246: continue;
247:
248: // get highlighter, and number of fragments for this field
249: Highlighter highlighter = getHighlighter(query,
250: fieldName, req);
251: int numFragments = getMaxSnippets(fieldName, req);
252:
253: String[] summaries;
254: TextFragment[] frag;
255: if (docTexts.length == 1) {
256: // single-valued field
257: TokenStream tstream;
258: try {
259: // attempt term vectors
260: tstream = TokenSources.getTokenStream(searcher
261: .getReader(), docId, fieldName);
262: } catch (IllegalArgumentException e) {
263: // fall back to analyzer
264: tstream = new TokenOrderingFilter(searcher
265: .getSchema().getAnalyzer().tokenStream(
266: fieldName,
267: new StringReader(docTexts[0])),
268: 10);
269: }
270: frag = highlighter.getBestTextFragments(tstream,
271: docTexts[0], false, numFragments);
272: } else {
273: // multi-valued field
274: MultiValueTokenStream tstream;
275: tstream = new MultiValueTokenStream(fieldName,
276: docTexts, searcher.getSchema()
277: .getAnalyzer(), true);
278: frag = highlighter.getBestTextFragments(tstream,
279: tstream.asSingleValue(), false,
280: numFragments);
281: }
282: // convert fragments back into text
283: // TODO: we can include score and position information in output as snippet attributes
284: if (frag.length > 0) {
285: ArrayList<String> fragTexts = new ArrayList<String>();
286: for (int j = 0; j < frag.length; j++) {
287: if ((frag[j] != null)
288: && (frag[j].getScore() > 0)) {
289: fragTexts.add(frag[j].toString());
290: }
291: }
292: summaries = fragTexts.toArray(new String[0]);
293: if (summaries.length > 0)
294: docSummaries.add(fieldName, summaries);
295: }
296: }
297: String printId = searcher.getSchema().printableUniqueKey(
298: doc);
299: fragments.add(printId == null ? null : printId,
300: docSummaries);
301: }
302: return fragments;
303: }
304: }
305:
/**
 * Helper class which creates a single TokenStream out of values from a
 * multi-valued field, re-basing each token's offsets into the string
 * produced by {@link #asSingleValue()} (the plain concatenation of all
 * values).
 */
class MultiValueTokenStream extends TokenStream {
    private String fieldName;
    private String[] values;
    private Analyzer analyzer;
    private int curIndex; // next index into the values array
    private int curOffset; // offset into concatenated string
    private TokenStream currentStream; // tokenStream currently being iterated
    private boolean orderTokenOffsets; // wrap each value's stream in TokenOrderingFilter

    /** Constructs a TokenStream for consecutively-analyzed field values
     *
     * @param fieldName name of the field
     * @param values array of field data
     * @param analyzer analyzer instance
     * @param orderTokenOffsets if true, each per-value stream is wrapped in a
     *        TokenOrderingFilter (window of 10) so tokens come out sorted by
     *        startOffset
     */
    public MultiValueTokenStream(String fieldName, String[] values,
            Analyzer analyzer, boolean orderTokenOffsets) {
        this .fieldName = fieldName;
        this .values = values;
        this .analyzer = analyzer;
        curIndex = -1;
        curOffset = 0;
        currentStream = null;
        this .orderTokenOffsets = orderTokenOffsets;
    }

    /** Returns the next token in the stream, or null at EOS.
     * Lazily opens a fresh analyzer stream per value; when one value's
     * stream is exhausted it advances curOffset by that value's length and
     * recurses into the next value.
     */
    public Token next() throws IOException {
        int extra = 0;
        if (currentStream == null) {
            curIndex++;
            if (curIndex < values.length) {
                currentStream = analyzer.tokenStream(fieldName,
                        new StringReader(values[curIndex]));
                if (orderTokenOffsets)
                    currentStream = new TokenOrderingFilter(
                            currentStream, 10);
                // add extra space between multiple values
                if (curIndex > 0)
                    extra = analyzer.getPositionIncrementGap(fieldName);
            } else {
                return null;
            }
        }
        Token nextToken = currentStream.next();
        if (nextToken == null) {
            // current value exhausted: shift the offset base past it and
            // continue with the next value (recursion ends at values.length)
            curOffset += values[curIndex].length();
            currentStream = null;
            return next();
        }
        // create an modified token which is the offset into the concatenated
        // string of all values
        Token offsetToken = new Token(nextToken.termText(), nextToken
                .startOffset()
                + curOffset, nextToken.endOffset() + curOffset);
        // NOTE(review): the gap is multiplied by 10 — presumably to push the
        // increment past GapFragmenter.INCREMENT_THRESHOLD so a new fragment
        // starts at each value boundary; confirm before changing either side.
        offsetToken.setPositionIncrement(nextToken
                .getPositionIncrement()
                + extra * 10);
        return offsetToken;
    }

    /**
     * Returns all values as a single String into which the Tokens index with
     * their offsets.
     */
    public String asSingleValue() {
        StringBuilder sb = new StringBuilder();
        for (String str : values)
            sb.append(str);
        return sb.toString();
    }

}
383:
384: /**
385: * A simple modification of SimpleFragmenter which additionally creates new
386: * fragments when an unusually-large position increment is encountered
387: * (this behaves much better in the presence of multi-valued fields).
388: */
389: class GapFragmenter extends SimpleFragmenter {
390: /**
391: * When a gap in term positions is observed that is at least this big, treat
392: * the gap as a fragment delimiter.
393: */
394: public static final int INCREMENT_THRESHOLD = 50;
395: protected int fragOffsetAccum = 0;
396:
397: public GapFragmenter() {
398: }
399:
400: public GapFragmenter(int fragsize) {
401: super (fragsize);
402: }
403:
404: /* (non-Javadoc)
405: * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
406: */
407: public void start(String originalText) {
408: fragOffsetAccum = 0;
409: }
410:
411: /* (non-Javadoc)
412: * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
413: */
414: public boolean isNewFragment(Token token) {
415: boolean isNewFrag = token.endOffset() >= fragOffsetAccum
416: + getFragmentSize()
417: || token.getPositionIncrement() > INCREMENT_THRESHOLD;
418: if (isNewFrag) {
419: fragOffsetAccum += token.endOffset() - fragOffsetAccum;
420: }
421: return isNewFrag;
422: }
423: }
424:
/** Orders Tokens in a window first by their startOffset ascending.
 * endOffset is currently ignored.
 * This is meant to work around fickleness in the highlighter only. It
 * can mess up token positions and should not be used for indexing or querying.
 * Ordering is only guaranteed within a sliding window of
 * <code>windowSize</code> tokens; tokens further apart than the window may
 * still emerge out of order.
 */
class TokenOrderingFilter extends TokenFilter {
    private final int windowSize; // max tokens buffered before one is emitted
    private final LinkedList<Token> queue = new LinkedList<Token>(); // kept sorted by startOffset
    private boolean done = false; // true once the wrapped stream is exhausted

    protected TokenOrderingFilter(TokenStream input, int windowSize) {
        super (input);
        this .windowSize = windowSize;
    }

    /** Fills the window from the wrapped stream (insertion-sorting each new
     * token by startOffset), then emits the smallest buffered token, or null
     * once both the stream and the buffer are empty. */
    public Token next() throws IOException {
        while (!done && queue.size() < windowSize) {
            Token newTok = input.next();
            if (newTok == null) {
                done = true;
                break;
            }

            // reverse iterating for better efficiency since we know the
            // list is already sorted, and most token start offsets will be too.
            ListIterator<Token> iter = queue.listIterator(queue.size());
            while (iter.hasPrevious()) {
                if (newTok.startOffset() >= iter.previous()
                        .startOffset()) {
                    // insertion will be before what next() would return (what
                    // we just compared against), so move back one so the insertion
                    // will be after.
                    iter.next();
                    break;
                }
            }
            // if the loop ran off the front, iter is at index 0 and the new
            // token is inserted at the head (smallest startOffset so far)
            iter.add(newTok);
        }

        return queue.isEmpty() ? null : queue.removeFirst();
    }
}
|