001: package org.enhydra.snapper.wrapper.lucene;
002:
003: /**
004:
005: */
006:
007: import java.io.Reader;
008: import java.util.Date;
009:
010: /**
011: A field is a section of a Document. Each field has two parts, a name and a
012: value. Values may be free text, provided as a String or as a Reader, or they
013: may be atomic keywords, which are not further processed. Such keywords may
014: be used to represent dates, urls, etc. Fields are optionally stored in the
015: index, so that they may be returned with hits on the document.
016: */
017:
018: public final class Field implements java.io.Serializable {
019: private String name = "body";
020: private String stringValue = null;
021: private boolean storeTermVector = false;
022: private Reader readerValue = null;
023: private boolean isStored = false;
024: private boolean isIndexed = true;
025: private boolean isTokenized = true;
026:
027: private float boost = 1.0f;
028:
029: /** Sets the boost factor hits on this field. This value will be
030: * multiplied into the score of all hits on this this field of this
031: * document.
032: *
033: * <p>The boost is multiplied by {@link Document#getBoost()} of the document
034: * containing this field. If a document has multiple fields with the same
035: * name, all such values are multiplied together. This product is then
036: * multipled by the value {@link Similarity#lengthNorm(String,int)}, and
037: * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
038: * index. One should attempt to ensure that this product does not overflow
039: * the range of that encoding.
040: *
041: * @see Document#setBoost(float)
042: * @see Similarity#lengthNorm(String, int)
043: * @see Similarity#encodeNorm(float)
044: */
045: public void setBoost(float boost) {
046: this .boost = boost;
047: }
048:
049: /** Returns the boost factor for hits on any field of this document.
050: *
051: * <p>The default value is 1.0.
052: *
053: * <p>Note: this value is not stored directly with the document in the index.
054: * Documents returned from {@link IndexReader#document(int)} and {@link
055: * Hits#doc(int)} may thus not have the same value present as when this field
056: * was indexed.
057: *
058: * @see #setBoost(float)
059: */
060: public float getBoost() {
061: return boost;
062: }
063:
064: /** Constructs a String-valued Field that is not tokenized, but is indexed
065: and stored. Useful for non-text fields, e.g. date or url.
066: */
067: public static final Field Keyword(String name, String value) {
068: return new Field(name, value, true, true, false);
069: }
070:
071: /** Constructs a String-valued Field that is not tokenized nor indexed,
072: but is stored in the index, for return with hits. */
073: public static final Field UnIndexed(String name, String value) {
074: return new Field(name, value, true, false, false);
075: }
076:
077: /** Constructs a String-valued Field that is tokenized and indexed,
078: and is stored in the index, for return with hits. Useful for short text
079: fields, like "title" or "subject". Term vector will not be stored for this field. */
080: public static final Field Text(String name, String value) {
081: return Text(name, value, false);
082: }
083:
084: /** Constructs a Date-valued Field that is not tokenized and is indexed,
085: and stored in the index, for return with hits. */
086: public static final Field Keyword(String name, Date value) {
087: return new Field(name, DateField.dateToString(value), true,
088: true, false);
089: }
090:
091: /** Constructs a String-valued Field that is tokenized and indexed,
092: and is stored in the index, for return with hits. Useful for short text
093: fields, like "title" or "subject". */
094: public static final Field Text(String name, String value,
095: boolean storeTermVector) {
096: return new Field(name, value, true, true, true, storeTermVector);
097: }
098:
099: /** Constructs a String-valued Field that is tokenized and indexed,
100: but that is not stored in the index. Term vector will not be stored for this field. */
101: public static final Field UnStored(String name, String value) {
102: return UnStored(name, value, false);
103: }
104:
105: /** Constructs a String-valued Field that is tokenized and indexed,
106: but that is not stored in the index. */
107: public static final Field UnStored(String name, String value,
108: boolean storeTermVector) {
109: return new Field(name, value, false, true, true,
110: storeTermVector);
111: }
112:
113: /** Constructs a Reader-valued Field that is tokenized and indexed, but is
114: not stored in the index verbatim. Useful for longer text fields, like
115: "body". Term vector will not be stored for this field. */
116: public static final Field Text(String name, Reader value) {
117: return Text(name, value, false);
118: }
119:
120: /** Constructs a Reader-valued Field that is tokenized and indexed, but is
121: not stored in the index verbatim. Useful for longer text fields, like
122: "body". */
123: public static final Field Text(String name, Reader value,
124: boolean storeTermVector) {
125: Field f = new Field(name, value);
126: f.storeTermVector = storeTermVector;
127: return f;
128: }
129:
130: /** The name of the field (e.g., "date", "subject", "title", or "body")
131: as an interned string. */
132: public String name() {
133: return name;
134: }
135:
136: /** The value of the field as a String, or null. If null, the Reader value
137: is used. Exactly one of stringValue() and readerValue() must be set. */
138: public String stringValue() {
139: return stringValue;
140: }
141:
142: /** The value of the field as a Reader, or null. If null, the String value
143: is used. Exactly one of stringValue() and readerValue() must be set. */
144: public Reader readerValue() {
145: return readerValue;
146: }
147:
148: /** Create a field by specifying all parameters except for <code>storeTermVector</code>,
149: * which is set to <code>false</code>.
150: */
151: public Field(String name, String string, boolean store,
152: boolean index, boolean token) {
153: this (name, string, store, index, token, false);
154: }
155:
156: /**
157: *
158: * @param name The name of the field
159: * @param string The string to process
160: * @param store true if the field should store the string
161: * @param index true if the field should be indexed
162: * @param token true if the field should be tokenized
163: * @param storeTermVector true if we should store the Term Vector info
164: */
165: public Field(String name, String string, boolean store,
166: boolean index, boolean token, boolean storeTermVector) {
167: if (name == null)
168: throw new IllegalArgumentException("name cannot be null");
169: if (string == null)
170: throw new IllegalArgumentException("value cannot be null");
171: if (!index && storeTermVector)
172: throw new IllegalArgumentException(
173: "cannot store a term vector for fields that are not indexed.");
174:
175: this .name = name.intern(); // field names are interned
176: this .stringValue = string;
177: this .isStored = store;
178: this .isIndexed = index;
179: this .isTokenized = token;
180: this .storeTermVector = storeTermVector;
181: }
182:
183: Field(String name, Reader reader) {
184: if (name == null)
185: throw new IllegalArgumentException("name cannot be null");
186: if (reader == null)
187: throw new IllegalArgumentException("value cannot be null");
188:
189: this .name = name.intern(); // field names are interned
190: this .readerValue = reader;
191: }
192:
193: /** True iff the value of the field is to be stored in the index for return
194: with search hits. It is an error for this to be true if a field is
195: Reader-valued. */
196: public final boolean isStored() {
197: return isStored;
198: }
199:
200: /** True iff the value of the field is to be indexed, so that it may be
201: searched on. */
202: public final boolean isIndexed() {
203: return isIndexed;
204: }
205:
206: /** True iff the value of the field should be tokenized as text prior to
207: indexing. Un-tokenized fields are indexed as a single word and may not be
208: Reader-valued. */
209: public final boolean isTokenized() {
210: return isTokenized;
211: }
212:
213: /** True iff the term or terms used to index this field are stored as a term
214: * vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
215: * These methods do not provide access to the original content of the field,
216: * only to terms used to index it. If the original content must be
217: * preserved, use the <code>stored</code> attribute instead.
218: *
219: * @see IndexReader#getTermFreqVector(int, String)
220: */
221: public final boolean isTermVectorStored() {
222: return storeTermVector;
223: }
224:
225: /** Prints a Field for human consumption. */
226: public final String toString() {
227: if (isStored && isIndexed && !isTokenized)
228: return "Keyword<" + name + ":" + stringValue + ">";
229: else if (isStored && !isIndexed && !isTokenized)
230: return "Unindexed<" + name + ":" + stringValue + ">";
231: else if (isStored && isIndexed && isTokenized
232: && stringValue != null)
233: return "Text<" + name + ":" + stringValue + ">";
234: else if (!isStored && isIndexed && isTokenized
235: && readerValue != null)
236: return "Text<" + name + ":" + readerValue + ">";
237: else if (!isStored && isIndexed && isTokenized) {
238: return "UnStored<" + name + ">";
239: } else {
240: return super.toString();
241: }
242: }
243:
244: }
|