001: package org.apache.lucene.analysis;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.index.Payload;
021: import org.apache.lucene.index.TermPositions;
022:
023: /** A Token is an occurrence of a term from the text of a field. It consists of
024: a term's text, the start and end offset of the term in the text of the field,
025: and a type string.
026: <p>
027: The start and end offsets permit applications to re-associate a token with
028: its source text, e.g., to display highlighted query terms in a document
029: browser, or to show matching text fragments in a KWIC (KeyWord In Context)
030: display, etc.
031: <p>
032: The type is an interned string, assigned by a lexical analyzer
033: (a.k.a. tokenizer), naming the lexical or syntactic class that the token
034: belongs to. For example an end of sentence marker token might be implemented
035: with type "eos". The default token type is "word".
036: <p>
037: A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
038: length byte array. Use {@link TermPositions#getPayloadLength()} and
039: {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
040:
041: <br><br>
042: <p><font color="#FF0000">
043: WARNING: The status of the <b>Payloads</b> feature is experimental.
044: The APIs introduced here might change in the future and will not be
045: supported anymore in such a case.</font>
046:
047: <br><br>
048:
049: <p><b>NOTE:</b> As of 2.3, Token stores the term text
050: internally as a malleable char[] termBuffer instead of
051: String termText. The indexing code and core tokenizers
052: have been changed to re-use a single Token instance, changing
053: its buffer and other fields in-place as the Token is
054: processed. This provides substantially better indexing
055: performance as it saves the GC cost of new'ing a Token and
056: String for every term. The APIs that accept String
057: termText are still available but a warning about the
058: associated performance cost has been added (below). The
059: {@link #termText()} method has been deprecated.</p>
060:
061: <p>Tokenizers and filters should try to re-use a Token
062: instance when possible for best performance, by
063: implementing the {@link TokenStream#next(Token)} API.
064: Failing that, to create a new Token you should first use
065: one of the constructors that starts with null text. Then
066: you should call either {@link #termBuffer()} or {@link
067: #resizeTermBuffer(int)} to retrieve the Token's
068: termBuffer. Fill in the characters of your term into this
069: buffer, and finally call {@link #setTermLength(int)} to
070: set the length of the term text. See <a target="_top"
071: href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
072: for details.</p>
073:
074: @see org.apache.lucene.index.Payload
075: */
076: public class Token implements Cloneable {
077:
078: public static final String DEFAULT_TYPE = "word";
079: private static int MIN_BUFFER_SIZE = 10;
080:
081: /** @deprecated: we will remove this when we remove the
082: * deprecated APIs */
083: private String termText;
084:
085: char[] termBuffer; // characters for the term text
086: int termLength; // length of term text in buffer
087:
088: int startOffset; // start in source text
089: int endOffset; // end in source text
090: String type = DEFAULT_TYPE; // lexical type
091:
092: Payload payload;
093:
094: int positionIncrement = 1;
095:
096: /** Constructs a Token will null text. */
097: public Token() {
098: }
099:
100: /** Constructs a Token with null text and start & end
101: * offsets.
102: * @param start start offset
103: * @param end end offset */
104: public Token(int start, int end) {
105: startOffset = start;
106: endOffset = end;
107: }
108:
109: /** Constructs a Token with null text and start & end
110: * offsets plus the Token type.
111: * @param start start offset
112: * @param end end offset */
113: public Token(int start, int end, String typ) {
114: startOffset = start;
115: endOffset = end;
116: type = typ;
117: }
118:
119: /** Constructs a Token with the given term text, and start
120: * & end offsets. The type defaults to "word."
121: * <b>NOTE:</b> for better indexing speed you should
122: * instead use the char[] termBuffer methods to set the
123: * term text.
124: * @param text term text
125: * @param start start offset
126: * @param end end offset */
127: public Token(String text, int start, int end) {
128: termText = text;
129: startOffset = start;
130: endOffset = end;
131: }
132:
133: /** Constructs a Token with the given text, start and end
134: * offsets, & type. <b>NOTE:</b> for better indexing
135: * speed you should instead use the char[] termBuffer
136: * methods to set the term text.
137: * @param text term text
138: * @param start start offset
139: * @param end end offset
140: * @param typ token type */
141: public Token(String text, int start, int end, String typ) {
142: termText = text;
143: startOffset = start;
144: endOffset = end;
145: type = typ;
146: }
147:
148: /** Set the position increment. This determines the position of this token
149: * relative to the previous Token in a {@link TokenStream}, used in phrase
150: * searching.
151: *
152: * <p>The default value is one.
153: *
154: * <p>Some common uses for this are:<ul>
155: *
156: * <li>Set it to zero to put multiple terms in the same position. This is
157: * useful if, e.g., a word has multiple stems. Searches for phrases
158: * including either stem will match. In this case, all but the first stem's
159: * increment should be set to zero: the increment of the first instance
160: * should be one. Repeating a token with an increment of zero can also be
161: * used to boost the scores of matches on that token.
162: *
163: * <li>Set it to values greater than one to inhibit exact phrase matches.
164: * If, for example, one does not want phrases to match across removed stop
165: * words, then one could build a stop word filter that removes stop words and
166: * also sets the increment to the number of stop words removed before each
167: * non-stop word. Then exact phrase queries will only match when the terms
168: * occur with no intervening stop words.
169: *
170: * </ul>
171: * @see org.apache.lucene.index.TermPositions
172: */
173: public void setPositionIncrement(int positionIncrement) {
174: if (positionIncrement < 0)
175: throw new IllegalArgumentException(
176: "Increment must be zero or greater: "
177: + positionIncrement);
178: this .positionIncrement = positionIncrement;
179: }
180:
181: /** Returns the position increment of this Token.
182: * @see #setPositionIncrement
183: */
184: public int getPositionIncrement() {
185: return positionIncrement;
186: }
187:
188: /** Sets the Token's term text. <b>NOTE:</b> for better
189: * indexing speed you should instead use the char[]
190: * termBuffer methods to set the term text. */
191: public void setTermText(String text) {
192: termText = text;
193: termBuffer = null;
194: }
195:
196: /** Returns the Token's term text.
197: *
198: * @deprecated Use {@link #termBuffer()} and {@link
199: * #termLength()} instead. */
200: public final String termText() {
201: if (termText == null && termBuffer != null)
202: termText = new String(termBuffer, 0, termLength);
203: return termText;
204: }
205:
206: /** Copies the contents of buffer, starting at offset for
207: * length characters, into the termBuffer
208: * array. <b>NOTE:</b> for better indexing speed you
209: * should instead retrieve the termBuffer, using {@link
210: * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
211: * fill it in directly to set the term text. This saves
212: * an extra copy. */
213: public final void setTermBuffer(char[] buffer, int offset,
214: int length) {
215: resizeTermBuffer(length);
216: System.arraycopy(buffer, offset, termBuffer, 0, length);
217: termLength = length;
218: }
219:
220: /** Returns the internal termBuffer character array which
221: * you can then directly alter. If the array is too
222: * small for your token, use {@link
223: * #resizeTermBuffer(int)} to increase it. After
224: * altering the buffer be sure to call {@link
225: * #setTermLength} to record the number of valid
226: * characters that were placed into the termBuffer. */
227: public final char[] termBuffer() {
228: initTermBuffer();
229: return termBuffer;
230: }
231:
232: /** Grows the termBuffer to at least size newSize.
233: * @param newSize minimum size of the new termBuffer
234: * @return newly created termBuffer with length >= newSize
235: */
236: public char[] resizeTermBuffer(int newSize) {
237: initTermBuffer();
238: if (newSize > termBuffer.length) {
239: int size = termBuffer.length;
240: while (size < newSize)
241: size *= 2;
242: char[] newBuffer = new char[size];
243: System.arraycopy(termBuffer, 0, newBuffer, 0,
244: termBuffer.length);
245: termBuffer = newBuffer;
246: }
247: return termBuffer;
248: }
249:
250: // TODO: once we remove the deprecated termText() method
251: // and switch entirely to char[] termBuffer we don't need
252: // to use this method anymore
253: private void initTermBuffer() {
254: if (termBuffer == null) {
255: if (termText == null) {
256: termBuffer = new char[MIN_BUFFER_SIZE];
257: termLength = 0;
258: } else {
259: int length = termText.length();
260: if (length < MIN_BUFFER_SIZE)
261: length = MIN_BUFFER_SIZE;
262: termBuffer = new char[length];
263: termLength = termText.length();
264: termText.getChars(0, termText.length(), termBuffer, 0);
265: termText = null;
266: }
267: } else if (termText != null)
268: termText = null;
269: }
270:
271: /** Return number of valid characters (length of the term)
272: * in the termBuffer array. */
273: public final int termLength() {
274: initTermBuffer();
275: return termLength;
276: }
277:
278: /** Set number of valid characters (length of the term) in
279: * the termBuffer array. */
280: public final void setTermLength(int length) {
281: initTermBuffer();
282: termLength = length;
283: }
284:
285: /** Returns this Token's starting offset, the position of the first character
286: corresponding to this token in the source text.
287:
288: Note that the difference between endOffset() and startOffset() may not be
289: equal to termText.length(), as the term text may have been altered by a
290: stemmer or some other filter. */
291: public final int startOffset() {
292: return startOffset;
293: }
294:
295: /** Set the starting offset.
296: @see #startOffset() */
297: public void setStartOffset(int offset) {
298: this .startOffset = offset;
299: }
300:
301: /** Returns this Token's ending offset, one greater than the position of the
302: last character corresponding to this token in the source text. */
303: public final int endOffset() {
304: return endOffset;
305: }
306:
307: /** Set the ending offset.
308: @see #endOffset() */
309: public void setEndOffset(int offset) {
310: this .endOffset = offset;
311: }
312:
313: /** Returns this Token's lexical type. Defaults to "word". */
314: public final String type() {
315: return type;
316: }
317:
318: /** Set the lexical type.
319: @see #type() */
320: public final void setType(String type) {
321: this .type = type;
322: }
323:
324: /**
325: * Returns this Token's payload.
326: */
327: public Payload getPayload() {
328: return this .payload;
329: }
330:
331: /**
332: * Sets this Token's payload.
333: */
334: public void setPayload(Payload payload) {
335: this .payload = payload;
336: }
337:
338: public String toString() {
339: StringBuffer sb = new StringBuffer();
340: sb.append('(');
341: initTermBuffer();
342: if (termBuffer == null)
343: sb.append("null");
344: else
345: sb.append(termBuffer, 0, termLength);
346: sb.append(',').append(startOffset).append(',')
347: .append(endOffset);
348: if (!type.equals("word"))
349: sb.append(",type=").append(type);
350: if (positionIncrement != 1)
351: sb.append(",posIncr=").append(positionIncrement);
352: sb.append(')');
353: return sb.toString();
354: }
355:
356: /** Resets the term text, payload, and positionIncrement to default.
357: * Other fields such as startOffset, endOffset and the token type are
358: * not reset since they are normally overwritten by the tokenizer. */
359: public void clear() {
360: payload = null;
361: // Leave termBuffer to allow re-use
362: termLength = 0;
363: termText = null;
364: positionIncrement = 1;
365: // startOffset = endOffset = 0;
366: // type = DEFAULT_TYPE;
367: }
368:
369: public Object clone() {
370: try {
371: Token t = (Token) super .clone();
372: if (termBuffer != null) {
373: t.termBuffer = null;
374: t.setTermBuffer(termBuffer, 0, termLength);
375: }
376: if (payload != null) {
377: t.setPayload((Payload) payload.clone());
378: }
379: return t;
380: } catch (CloneNotSupportedException e) {
381: throw new RuntimeException(e); // shouldn't happen
382: }
383: }
384: }
|