/*
 * Created on 28-Oct-2004
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use with
 * the highlighter - can obtain from TermFreqVectors with offsets and (optionally) positions,
 * or from an Analyzer re-parsing the stored content.
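 * <p>
 * To make term vector data available to this class, the field must be indexed with
 * term vectors that store offsets (and, ideally, positions). A minimal indexing sketch -
 * this assumes a Lucene version where the Field.TermVector enumeration is available,
 * and the field name is illustrative only:
 * <pre>
 *   // assumption: Field.TermVector.WITH_POSITIONS_OFFSETS exists in the Lucene version in use
 *   doc.add(new Field("contents", text,
 *       Field.Store.YES, Field.Index.TOKENIZED,
 *       Field.TermVector.WITH_POSITIONS_OFFSETS));
 * </pre>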
 * @author maharwood
 */
public class TokenSources {
    /**
     * A convenience method that tries a number of approaches to getting a token stream.
     * The cost of discovering that there are no term vectors in the index is minimal
     * (1000 invocations still register 0 ms), so this "lazy" (flexible?) approach to coding
     * is probably acceptable.
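     * <p>
     * A sketch of typical use with the {@link Highlighter} in this package - the reader,
     * analyzer, query and field name here are illustrative placeholders, not part of this API:
     * <pre>
     *   // assumption: reader, analyzer and query have been created elsewhere
     *   TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, "contents", analyzer);
     *   String text = reader.document(docId).get("contents");
     *   Highlighter highlighter = new Highlighter(new QueryScorer(query));
     *   String fragments = highlighter.getBestFragments(tokenStream, text, 3, "...");
     * </pre>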
     * @param reader the index to read the document and any term vector data from
     * @param docId the id of the document to obtain a token stream for
     * @param field the name of the field to obtain a token stream for
     * @param analyzer used to re-parse the stored content when no suitable term vector is available
     * @return a token stream rebuilt from term vector data where available, otherwise obtained
     *         by re-analyzing the stored content of the field
     * @throws IOException
     */
    public static TokenStream getAnyTokenStream(IndexReader reader,
            int docId, String field, Analyzer analyzer)
            throws IOException {
        TokenStream ts = null;

        TermFreqVector tfv = reader.getTermFreqVector(docId, field);
        if (tfv != null) {
            if (tfv instanceof TermPositionVector) {
                ts = getTokenStream((TermPositionVector) tfv);
            }
        }
        // No token info stored so fall back to analyzing raw content
        if (ts == null) {
            ts = getTokenStream(reader, docId, field, analyzer);
        }
        return ts;
    }

    /**
     * Equivalent to calling {@link #getTokenStream(TermPositionVector, boolean)} with
     * <code>tokenPositionsGuaranteedContiguous</code> set to false.
     */
    public static TokenStream getTokenStream(TermPositionVector tpv) {
        // assume the worst and make no assumptions about token position sequences
        return getTokenStream(tpv, false);
    }

    /**
     * Low level api.
     * Returns a token stream, or null if no offset info is available in the index.
     * This can be used to feed the highlighter with a pre-parsed token stream.
     *
     * In my tests the speeds to recreate 1000 token streams using this method are:
     * - with TermVector offset only data stored - 420 milliseconds
     * - with TermVector offset AND position data stored - 271 milliseconds
     * (nb timings for TermVector with position data are based on a tokenizer with contiguous
     * positions - no overlaps or gaps)
     * The cost of not using TermPositionVector to store
     * pre-parsed content and using an analyzer to re-parse the original content:
     * - reanalyzing the original content - 980 milliseconds
     *
     * The re-analyze timings will typically vary depending on -
     * 1) The complexity of the analyzer code (timings above were using a
     *    stemmer/lowercaser/stopword combo)
     * 2) The number of other fields (Lucene reads ALL fields off the disk
     *    when accessing just one document field - this can cost dearly!)
     * 3) Use of compression on field storage - could be faster because of compression (less disk IO)
     *    or slower (more CPU burn) depending on the content.
     *
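     * <p>
     * A minimal sketch of driving this low level method directly (the reader, docId and
     * field name are illustrative only):
     * <pre>
     *   TermFreqVector tfv = reader.getTermFreqVector(docId, "contents");
     *   if (tfv instanceof TermPositionVector) {
     *     TokenStream ts = TokenSources.getTokenStream((TermPositionVector) tfv, false);
     *     // ts is null if the field was indexed without offset information
     *   }
     * </pre>
     *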
     * @param tpv the term position vector (with offsets) from which to rebuild the tokens
     * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps.
     *        If looking to eke out the last drops of performance, set to true. If in doubt, set to false.
     */
    public static TokenStream getTokenStream(TermPositionVector tpv,
            boolean tokenPositionsGuaranteedContiguous) {
        // an object used to iterate across an array of tokens
        class StoredTokenStream extends TokenStream {
            Token[] tokens;
            int currentToken = 0;

            StoredTokenStream(Token[] tokens) {
                this.tokens = tokens;
            }

            public Token next() {
                if (currentToken >= tokens.length) {
                    return null;
                }
                return tokens[currentToken++];
            }
        }
        // code to reconstruct the original sequence of Tokens
        String[] terms = tpv.getTerms();
        int[] freq = tpv.getTermFrequencies();
        int totalTokens = 0;
        for (int t = 0; t < freq.length; t++) {
            totalTokens += freq[t];
        }
        Token[] tokensInOriginalOrder = new Token[totalTokens];
        ArrayList unsortedTokens = null;
        for (int t = 0; t < freq.length; t++) {
            TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
            if (offsets == null) {
                return null;
            }

            int[] pos = null;
            if (tokenPositionsGuaranteedContiguous) {
                // try to get the token position info to speed up assembly of tokens into sorted sequence
                pos = tpv.getTermPositions(t);
            }
            if (pos == null) {
                // tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                if (unsortedTokens == null) {
                    unsortedTokens = new ArrayList();
                }
                for (int tp = 0; tp < offsets.length; tp++) {
                    unsortedTokens.add(new Token(terms[t],
                            offsets[tp].getStartOffset(),
                            offsets[tp].getEndOffset()));
                }
            } else {
                // We have positions stored and a guarantee that the token position information is contiguous

                // This may be fast BUT won't work with Tokenizers that create more than one token in the
                // same position, or that create jumps in the position numbers - this code would fail
                // under those circumstances

                // tokens stored with positions - can use this to index straight into the sorted array
                for (int tp = 0; tp < pos.length; tp++) {
                    tokensInOriginalOrder[pos[tp]] = new Token(
                            terms[t], offsets[tp].getStartOffset(),
                            offsets[tp].getEndOffset());
                }
            }
        }
        // If the field has been stored without position data we must perform a sort
        if (unsortedTokens != null) {
            tokensInOriginalOrder = (Token[]) unsortedTokens
                    .toArray(new Token[unsortedTokens.size()]);
            Arrays.sort(tokensInOriginalOrder, new Comparator() {
                public int compare(Object o1, Object o2) {
                    Token t1 = (Token) o1;
                    Token t2 = (Token) o2;
                    if (t1.startOffset() > t2.startOffset())
                        return 1;
                    if (t1.startOffset() < t2.startOffset())
                        return -1;
                    return 0;
                }
            });
        }
        return new StoredTokenStream(tokensInOriginalOrder);
    }

    /**
     * Rebuilds a token stream from the term vector data stored for the given document field.
     * @throws IllegalArgumentException if the field was not indexed with term position data
     */
    public static TokenStream getTokenStream(IndexReader reader,
            int docId, String field) throws IOException {
        TermFreqVector tfv = reader.getTermFreqVector(docId, field);
        if (tfv == null) {
            throw new IllegalArgumentException(field + " in doc #" + docId
                    + " does not have any term vector data stored");
        }
        if (tfv instanceof TermPositionVector) {
            TermPositionVector tpv = (TermPositionVector) tfv;
            return getTokenStream(tpv);
        }
        throw new IllegalArgumentException(field + " in doc #" + docId
                + " does not have any term position data stored");
    }

    /**
     * Convenience method which re-analyzes the stored content of a document field
     * using the supplied analyzer.
     * @throws IllegalArgumentException if the field was not stored
     */
    public static TokenStream getTokenStream(IndexReader reader,
            int docId, String field, Analyzer analyzer)
            throws IOException {
        Document doc = reader.document(docId);
        String contents = doc.get(field);
        if (contents == null) {
            throw new IllegalArgumentException("Field " + field
                    + " in document #" + docId
                    + " is not stored and cannot be analyzed");
        }
        return analyzer.tokenStream(field, new StringReader(contents));
    }

}