001: package org.apache.lucene.document;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.TokenStream;
021: import org.apache.lucene.index.IndexWriter; // for javadoc
022: import org.apache.lucene.util.Parameter;
023:
024: import java.io.Reader;
025: import java.io.Serializable;
026:
027: /**
028: A field is a section of a Document. Each field has two parts, a name and a
029: value. Values may be free text, provided as a String or as a Reader, or they
030: may be atomic keywords, which are not further processed. Such keywords may
031: be used to represent dates, urls, etc. Fields are optionally stored in the
032: index, so that they may be returned with hits on the document.
033: */
034:
035: public final class Field extends AbstractField implements Fieldable,
036: Serializable {
037:
038: /** Specifies whether and how a field should be stored. */
039: public static final class Store extends Parameter implements
040: Serializable {
041:
042: private Store(String name) {
043: super (name);
044: }
045:
046: /** Store the original field value in the index in a compressed form. This is
047: * useful for long documents and for binary valued fields.
048: */
049: public static final Store COMPRESS = new Store("COMPRESS");
050:
051: /** Store the original field value in the index. This is useful for short texts
052: * like a document's title which should be displayed with the results. The
053: * value is stored in its original form, i.e. no analyzer is used before it is
054: * stored.
055: */
056: public static final Store YES = new Store("YES");
057:
058: /** Do not store the field value in the index. */
059: public static final Store NO = new Store("NO");
060: }
061:
062: /** Specifies whether and how a field should be indexed. */
063: public static final class Index extends Parameter implements
064: Serializable {
065:
066: private Index(String name) {
067: super (name);
068: }
069:
070: /** Do not index the field value. This field can thus not be searched,
071: * but one can still access its contents provided it is
072: * {@link Field.Store stored}. */
073: public static final Index NO = new Index("NO");
074:
075: /** Index the field's value so it can be searched. An Analyzer will be used
076: * to tokenize and possibly further normalize the text before its
077: * terms will be stored in the index. This is useful for common text.
078: */
079: public static final Index TOKENIZED = new Index("TOKENIZED");
080:
081: /** Index the field's value without using an Analyzer, so it can be searched.
082: * As no analyzer is used the value will be stored as a single term. This is
083: * useful for unique Ids like product numbers.
084: */
085: public static final Index UN_TOKENIZED = new Index(
086: "UN_TOKENIZED");
087:
088: /** Index the field's value without an Analyzer, and disable
089: * the storing of norms. No norms means that index-time boosting
090: * and field length normalization will be disabled. The benefit is
091: * less memory usage as norms take up one byte per indexed field
092: * for every document in the index.
093: * Note that once you index a given field <i>with</i> norms enabled,
094: * disabling norms will have no effect. In other words, for NO_NORMS
095: * to have the above described effect on a field, all instances of that
096: * field must be indexed with NO_NORMS from the beginning.
097: */
098: public static final Index NO_NORMS = new Index("NO_NORMS");
099:
100: }
101:
102: /** Specifies whether and how a field should have term vectors. */
103: public static final class TermVector extends Parameter implements
104: Serializable {
105:
106: private TermVector(String name) {
107: super (name);
108: }
109:
110: /** Do not store term vectors.
111: */
112: public static final TermVector NO = new TermVector("NO");
113:
114: /** Store the term vectors of each document. A term vector is a list
115: * of the document's terms and their number of occurences in that document. */
116: public static final TermVector YES = new TermVector("YES");
117:
118: /**
119: * Store the term vector + token position information
120: *
121: * @see #YES
122: */
123: public static final TermVector WITH_POSITIONS = new TermVector(
124: "WITH_POSITIONS");
125:
126: /**
127: * Store the term vector + Token offset information
128: *
129: * @see #YES
130: */
131: public static final TermVector WITH_OFFSETS = new TermVector(
132: "WITH_OFFSETS");
133:
134: /**
135: * Store the term vector + Token position and offset information
136: *
137: * @see #YES
138: * @see #WITH_POSITIONS
139: * @see #WITH_OFFSETS
140: */
141: public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector(
142: "WITH_POSITIONS_OFFSETS");
143: }
144:
145: /** The value of the field as a String, or null. If null, the Reader value,
146: * binary value, or TokenStream value is used. Exactly one of stringValue(),
147: * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
148: public String stringValue() {
149: return fieldsData instanceof String ? (String) fieldsData
150: : null;
151: }
152:
153: /** The value of the field as a Reader, or null. If null, the String value,
154: * binary value, or TokenStream value is used. Exactly one of stringValue(),
155: * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
156: public Reader readerValue() {
157: return fieldsData instanceof Reader ? (Reader) fieldsData
158: : null;
159: }
160:
161: /** The value of the field in Binary, or null. If null, the Reader value,
162: * String value, or TokenStream value is used. Exactly one of stringValue(),
163: * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
164: public byte[] binaryValue() {
165: return fieldsData instanceof byte[] ? (byte[]) fieldsData
166: : null;
167: }
168:
169: /** The value of the field as a TokesStream, or null. If null, the Reader value,
170: * String value, or binary value is used. Exactly one of stringValue(),
171: * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
172: public TokenStream tokenStreamValue() {
173: return fieldsData instanceof TokenStream ? (TokenStream) fieldsData
174: : null;
175: }
176:
177: /** <p>Expert: change the value of this field. This can
178: * be used during indexing to re-use a single Field
179: * instance to improve indexing speed by avoiding GC cost
180: * of new'ing and reclaiming Field instances. Typically
181: * a single {@link Document} instance is re-used as
182: * well. This helps most on small documents.</p>
183: *
184: * <p>Note that you should only use this method after the
185: * Field has been consumed (ie, the {@link Document}
186: * containing this Field has been added to the index).
187: * Also, each Field instance should only be used once
188: * within a single {@link Document} instance. See <a
189: * href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
190: * for details.</p> */
191: public void setValue(String value) {
192: fieldsData = value;
193: }
194:
195: /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
196: public void setValue(Reader value) {
197: fieldsData = value;
198: }
199:
200: /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
201: public void setValue(byte[] value) {
202: fieldsData = value;
203: }
204:
205: /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
206: public void setValue(TokenStream value) {
207: fieldsData = value;
208: }
209:
210: /**
211: * Create a field by specifying its name, value and how it will
212: * be saved in the index. Term vectors will not be stored in the index.
213: *
214: * @param name The name of the field
215: * @param value The string to process
216: * @param store Whether <code>value</code> should be stored in the index
217: * @param index Whether the field should be indexed, and if so, if it should
218: * be tokenized before indexing
219: * @throws NullPointerException if name or value is <code>null</code>
220: * @throws IllegalArgumentException if the field is neither stored nor indexed
221: */
222: public Field(String name, String value, Store store, Index index) {
223: this (name, value, store, index, TermVector.NO);
224: }
225:
226: /**
227: * Create a field by specifying its name, value and how it will
228: * be saved in the index.
229: *
230: * @param name The name of the field
231: * @param value The string to process
232: * @param store Whether <code>value</code> should be stored in the index
233: * @param index Whether the field should be indexed, and if so, if it should
234: * be tokenized before indexing
235: * @param termVector Whether term vector should be stored
236: * @throws NullPointerException if name or value is <code>null</code>
237: * @throws IllegalArgumentException in any of the following situations:
238: * <ul>
239: * <li>the field is neither stored nor indexed</li>
240: * <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
241: * </ul>
242: */
243: public Field(String name, String value, Store store, Index index,
244: TermVector termVector) {
245: if (name == null)
246: throw new NullPointerException("name cannot be null");
247: if (value == null)
248: throw new NullPointerException("value cannot be null");
249: if (name.length() == 0 && value.length() == 0)
250: throw new IllegalArgumentException(
251: "name and value cannot both be empty");
252: if (index == Index.NO && store == Store.NO)
253: throw new IllegalArgumentException(
254: "it doesn't make sense to have a field that "
255: + "is neither indexed nor stored");
256: if (index == Index.NO && termVector != TermVector.NO)
257: throw new IllegalArgumentException(
258: "cannot store term vector information "
259: + "for a field that is not indexed");
260:
261: this .name = name.intern(); // field names are interned
262: this .fieldsData = value;
263:
264: if (store == Store.YES) {
265: this .isStored = true;
266: this .isCompressed = false;
267: } else if (store == Store.COMPRESS) {
268: this .isStored = true;
269: this .isCompressed = true;
270: } else if (store == Store.NO) {
271: this .isStored = false;
272: this .isCompressed = false;
273: } else
274: throw new IllegalArgumentException(
275: "unknown store parameter " + store);
276:
277: if (index == Index.NO) {
278: this .isIndexed = false;
279: this .isTokenized = false;
280: } else if (index == Index.TOKENIZED) {
281: this .isIndexed = true;
282: this .isTokenized = true;
283: } else if (index == Index.UN_TOKENIZED) {
284: this .isIndexed = true;
285: this .isTokenized = false;
286: } else if (index == Index.NO_NORMS) {
287: this .isIndexed = true;
288: this .isTokenized = false;
289: this .omitNorms = true;
290: } else {
291: throw new IllegalArgumentException(
292: "unknown index parameter " + index);
293: }
294:
295: this .isBinary = false;
296:
297: setStoreTermVector(termVector);
298: }
299:
300: /**
301: * Create a tokenized and indexed field that is not stored. Term vectors will
302: * not be stored. The Reader is read only when the Document is added to the index,
303: * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
304: * has been called.
305: *
306: * @param name The name of the field
307: * @param reader The reader with the content
308: * @throws NullPointerException if name or reader is <code>null</code>
309: */
310: public Field(String name, Reader reader) {
311: this (name, reader, TermVector.NO);
312: }
313:
314: /**
315: * Create a tokenized and indexed field that is not stored, optionally with
316: * storing term vectors. The Reader is read only when the Document is added to the index,
317: * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
318: * has been called.
319: *
320: * @param name The name of the field
321: * @param reader The reader with the content
322: * @param termVector Whether term vector should be stored
323: * @throws NullPointerException if name or reader is <code>null</code>
324: */
325: public Field(String name, Reader reader, TermVector termVector) {
326: if (name == null)
327: throw new NullPointerException("name cannot be null");
328: if (reader == null)
329: throw new NullPointerException("reader cannot be null");
330:
331: this .name = name.intern(); // field names are interned
332: this .fieldsData = reader;
333:
334: this .isStored = false;
335: this .isCompressed = false;
336:
337: this .isIndexed = true;
338: this .isTokenized = true;
339:
340: this .isBinary = false;
341:
342: setStoreTermVector(termVector);
343: }
344:
345: /**
346: * Create a tokenized and indexed field that is not stored. Term vectors will
347: * not be stored. This is useful for pre-analyzed fields.
348: * The TokenStream is read only when the Document is added to the index,
349: * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
350: * has been called.
351: *
352: * @param name The name of the field
353: * @param tokenStream The TokenStream with the content
354: * @throws NullPointerException if name or tokenStream is <code>null</code>
355: */
356: public Field(String name, TokenStream tokenStream) {
357: this (name, tokenStream, TermVector.NO);
358: }
359:
360: /**
361: * Create a tokenized and indexed field that is not stored, optionally with
362: * storing term vectors. This is useful for pre-analyzed fields.
363: * The TokenStream is read only when the Document is added to the index,
364: * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
365: * has been called.
366: *
367: * @param name The name of the field
368: * @param tokenStream The TokenStream with the content
369: * @param termVector Whether term vector should be stored
370: * @throws NullPointerException if name or tokenStream is <code>null</code>
371: */
372: public Field(String name, TokenStream tokenStream,
373: TermVector termVector) {
374: if (name == null)
375: throw new NullPointerException("name cannot be null");
376: if (tokenStream == null)
377: throw new NullPointerException("tokenStream cannot be null");
378:
379: this .name = name.intern(); // field names are interned
380: this .fieldsData = tokenStream;
381:
382: this .isStored = false;
383: this .isCompressed = false;
384:
385: this .isIndexed = true;
386: this .isTokenized = true;
387:
388: this .isBinary = false;
389:
390: setStoreTermVector(termVector);
391: }
392:
393: /**
394: * Create a stored field with binary value. Optionally the value may be compressed.
395: *
396: * @param name The name of the field
397: * @param value The binary value
398: * @param store How <code>value</code> should be stored (compressed or not)
399: * @throws IllegalArgumentException if store is <code>Store.NO</code>
400: */
401: public Field(String name, byte[] value, Store store) {
402: if (name == null)
403: throw new IllegalArgumentException("name cannot be null");
404: if (value == null)
405: throw new IllegalArgumentException("value cannot be null");
406:
407: this .name = name.intern();
408: this .fieldsData = value;
409:
410: if (store == Store.YES) {
411: this .isStored = true;
412: this .isCompressed = false;
413: } else if (store == Store.COMPRESS) {
414: this .isStored = true;
415: this .isCompressed = true;
416: } else if (store == Store.NO)
417: throw new IllegalArgumentException(
418: "binary values can't be unstored");
419: else
420: throw new IllegalArgumentException(
421: "unknown store parameter " + store);
422:
423: this .isIndexed = false;
424: this .isTokenized = false;
425:
426: this .isBinary = true;
427:
428: setStoreTermVector(TermVector.NO);
429: }
430:
431: }
|