/*
 * Created on 28-Oct-2004
 */
package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use with
 * the highlighter - the stream can be obtained from a TermFreqVector with offsets and
 * (optionally) positions, or from an Analyzer re-parsing the stored content.
 * @author maharwood
 */
public class TokenSources {
  /**
   * A convenience method that tries a number of approaches to getting a token stream.
   * The cost of discovering that no term vectors are stored in the index is minimal
   * (1000 invocations still register as 0 ms), so this "lazy" approach is an acceptable default.
   * @param reader the index to read from
   * @param docId the document whose field is to be turned into a token stream
   * @param field the field to obtain tokens for
   * @param analyzer used to re-parse the stored content if no suitable term vector is available
   * @return a TokenStream built from the term vector if one with offsets is stored,
   *         otherwise one produced by re-analyzing the stored field content
   * @throws IOException if the index cannot be read
   */
  public static TokenStream getAnyTokenStream(IndexReader reader,
      int docId, String field, Analyzer analyzer)
      throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv != null) {
      if (tfv instanceof TermPositionVector) {
        ts = getTokenStream((TermPositionVector) tfv);
      }
    }
    //No token info stored so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
  }
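
  // A hedged usage sketch (not part of the original class): how a caller might combine
  // getAnyTokenStream with the contrib Highlighter. The Highlighter and QueryScorer names
  // and the getBestFragment signature are assumed from the highlighter package this class
  // ships with; the "body" field name is illustrative only.
  //
  //   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
  //   Highlighter highlighter = new Highlighter(new QueryScorer(query));
  //   String fragment = highlighter.getBestFragment(ts, reader.document(docId).get("body"));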

  public static TokenStream getTokenStream(TermPositionVector tpv) {
    //assumes the worst and makes no assumptions about token position sequences.
    return getTokenStream(tpv, false);
  }

  /**
   * Low level api.
   * Returns a token stream, or null if no offset info is available in the index.
   * This can be used to feed the highlighter with a pre-parsed token stream.
   *
   * In my tests the speeds to recreate 1000 token streams using this method are:
   * - with TermVector offset-only data stored - 420 milliseconds
   * - with TermVector offset AND position data stored - 271 milliseconds
   *   (nb timings for TermVector with position data are based on a tokenizer with contiguous
   *   positions - no overlaps or gaps)
   * The cost of not using TermPositionVector to store
   * pre-parsed content and using an analyzer to re-parse the original content:
   * - reanalyzing the original content - 980 milliseconds
   *
   * The re-analyze timings will typically vary depending on -
   * 1) The complexity of the analyzer code (timings above were using a
   *    stemmer/lowercaser/stopword combo)
   * 2) The number of other fields (Lucene reads ALL fields off the disk
   *    when accessing just one document field - this can prove costly!)
   * 3) Use of compression on field storage - could be faster because of compression (less disk IO)
   *    or slower (more CPU burn) depending on the content.
   *
   * @param tpv the TermPositionVector (with offsets stored) to convert into a token stream
   * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps
   *        or gaps. If looking to eke out the last drops of performance, set to true.
   *        If in doubt, set to false.
   */
  public static TokenStream getTokenStream(TermPositionVector tpv,
      boolean tokenPositionsGuaranteedContiguous) {
    //an object used to iterate across an array of tokens
    class StoredTokenStream extends TokenStream {
      Token tokens[];
      int currentToken = 0;

      StoredTokenStream(Token tokens[]) {
        this.tokens = tokens;
      }

      public Token next() {
        if (currentToken >= tokens.length) {
          return null;
        }
        return tokens[currentToken++];
      }
    }
    //code to reconstruct the original sequence of Tokens
    String[] terms = tpv.getTerms();
    int[] freq = tpv.getTermFrequencies();
    int totalTokens = 0;
    for (int t = 0; t < freq.length; t++) {
      totalTokens += freq[t];
    }
    Token tokensInOriginalOrder[] = new Token[totalTokens];
    ArrayList unsortedTokens = null;
    for (int t = 0; t < freq.length; t++) {
      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
      if (offsets == null) {
        return null;
      }

      int[] pos = null;
      if (tokenPositionsGuaranteedContiguous) {
        //try to get the token position info to speed up assembly of tokens into sorted sequence
        pos = tpv.getTermPositions(t);
      }
      if (pos == null) {
        //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
        if (unsortedTokens == null) {
          unsortedTokens = new ArrayList();
        }
        for (int tp = 0; tp < offsets.length; tp++) {
          unsortedTokens.add(new Token(terms[t],
              offsets[tp].getStartOffset(),
              offsets[tp].getEndOffset()));
        }
      } else {
        //We have positions stored and a guarantee that the token position information is contiguous

        //This may be fast BUT won't work with Tokenizers that create more than one token in the
        //same position or that leave gaps in the position numbers - this code would fail under
        //those circumstances

        //tokens stored with positions - can use this to index straight into the sorted array
        for (int tp = 0; tp < pos.length; tp++) {
          tokensInOriginalOrder[pos[tp]] = new Token(
              terms[t], offsets[tp].getStartOffset(),
              offsets[tp].getEndOffset());
        }
      }
    }
    //If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null) {
      tokensInOriginalOrder = (Token[]) unsortedTokens
          .toArray(new Token[unsortedTokens.size()]);
      Arrays.sort(tokensInOriginalOrder, new Comparator() {
        public int compare(Object o1, Object o2) {
          Token t1 = (Token) o1;
          Token t2 = (Token) o2;
          if (t1.startOffset() > t2.startOffset())
            return 1;
          if (t1.startOffset() < t2.startOffset())
            return -1;
          return 0;
        }
      });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
  }
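
  // A hedged indexing sketch (not part of the original class): for getTokenStream(TermPositionVector)
  // to return a usable stream, the field must have been indexed with term vectors that include
  // offsets (and, for the fast contiguous path, positions). The "body" field name is illustrative
  // only; the Field and Document calls are the standard Lucene APIs of this era.
  //
  //   Document doc = new Document();
  //   doc.add(new Field("body", text, Field.Store.YES, Field.Index.TOKENIZED,
  //       Field.TermVector.WITH_POSITIONS_OFFSETS));
  //   writer.addDocument(doc);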

  public static TokenStream getTokenStream(IndexReader reader,
      int docId, String field) throws IOException {
    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv == null) {
      throw new IllegalArgumentException(field + " in doc #" + docId
          + " does not have any term vector data stored");
    }
    if (tfv instanceof TermPositionVector) {
      //reuse the vector already fetched rather than reading it from the index a second time
      return getTokenStream((TermPositionVector) tfv);
    }
    throw new IllegalArgumentException(field + " in doc #" + docId
        + " does not have any term position data stored");
  }

  /**
   * Convenience method that re-analyzes the stored content of a field to produce a token stream.
   * The field must have been stored in the index for this to work.
   */
  public static TokenStream getTokenStream(IndexReader reader,
      int docId, String field, Analyzer analyzer)
      throws IOException {
    Document doc = reader.document(docId);
    String contents = doc.get(field);
    if (contents == null) {
      throw new IllegalArgumentException("Field " + field
          + " in document #" + docId
          + " is not stored and cannot be analyzed");
    }
    return analyzer.tokenStream(field, new StringReader(contents));
  }
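
  // A hedged sketch (illustrative field name): if the field was indexed with stored content but
  // no term vectors, e.g.
  //
  //   doc.add(new Field("body", text, Field.Store.YES, Field.Index.TOKENIZED));
  //
  // then only the re-analysis path above is available, at the cost described in the
  // getTokenStream(TermPositionVector, boolean) javadoc.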

}