package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.io.Reader;

public class TestDocumentWriter extends LuceneTestCase {
  private RAMDirectory dir;

  public TestDocumentWriter(String s) {
    super(s);
  }

  protected void setUp() throws Exception {
    super.setUp();
    dir = new RAMDirectory();
  }

  public void test() {
    assertNotNull(dir);
  }

  public void testAddDocument() throws Exception {
    Document testDoc = new Document();
    DocHelper.setupDoc(testDoc);
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter writer = new IndexWriter(dir, analyzer, true);
    writer.addDocument(testDoc);
    writer.flush();
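    // flush() forces the buffered document into a segment;
    // newestSegment() then exposes that segment's SegmentInfo so it can
    // be opened directly with a SegmentReader below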
    SegmentInfo info = writer.newestSegment();
    writer.close();
    // After adding the document, we should be able to read it back in
    SegmentReader reader = SegmentReader.get(info);
    assertNotNull(reader);
    Document doc = reader.document(0);
    assertNotNull(doc);

    //System.out.println("Document: " + doc);
    Fieldable[] fields = doc.getFields("textField2");
    assertTrue(fields != null && fields.length == 1);
    assertEquals(DocHelper.FIELD_2_TEXT, fields[0].stringValue());
    assertTrue(fields[0].isTermVectorStored());

    fields = doc.getFields("textField1");
    assertTrue(fields != null && fields.length == 1);
    assertEquals(DocHelper.FIELD_1_TEXT, fields[0].stringValue());
    assertFalse(fields[0].isTermVectorStored());

    fields = doc.getFields("keyField");
    assertTrue(fields != null && fields.length == 1);
    assertEquals(DocHelper.KEYWORD_TEXT, fields[0].stringValue());

    fields = doc.getFields(DocHelper.NO_NORMS_KEY);
    assertTrue(fields != null && fields.length == 1);
    assertEquals(DocHelper.NO_NORMS_TEXT, fields[0].stringValue());

    fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
    assertTrue(fields != null && fields.length == 1);
    assertEquals(DocHelper.FIELD_3_TEXT, fields[0].stringValue());

    // test that the norms are not present in the segment if
    // omitNorms is true
    for (int i = 0; i < reader.fieldInfos.size(); i++) {
      FieldInfo fi = reader.fieldInfos.fieldInfo(i);
      if (fi.isIndexed) {
        assertTrue(fi.omitNorms == !reader.hasNorms(fi.name));
      }
    }
  }

  public void testPositionIncrementGap() throws IOException {
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new WhitespaceTokenizer(reader);
      }

      public int getPositionIncrementGap(String fieldName) {
        return 500;
      }
    };

    IndexWriter writer = new IndexWriter(dir, analyzer, true);

    Document doc = new Document();
    doc.add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.TOKENIZED));
    doc.add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.TOKENIZED));

    writer.addDocument(doc);
    writer.flush();
    SegmentInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = SegmentReader.get(info);

    TermPositions termPositions = reader.termPositions(new Term("repeated", "repeated"));
    assertTrue(termPositions.next());
    int freq = termPositions.freq();
    assertEquals(2, freq);
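    // "repeated one" occupies positions 0-1; the gap of 500 plus the
    // normal increment of 1 places "repeated two" at 502-503, so the
    // term "repeated" is expected at positions 0 and 502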
    assertEquals(0, termPositions.nextPosition());
    assertEquals(502, termPositions.nextPosition());
  }

  public void testTokenReuse() throws IOException {
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(new WhitespaceTokenizer(reader)) {
          boolean first = true;
          Token buffered;

          public Token next() throws IOException {
            return input.next();
          }

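          // next(Token) is the token-reuse variant this test exercises;
          // the no-arg next() above simply delegates to the wrapped stream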
          public Token next(Token result) throws IOException {
            if (buffered != null) {
              Token t = buffered;
              buffered = null;
              return t;
            }
            Token t = input.next(result);
            if (t == null)
              return null;
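            // a leading digit encodes the token's position increment,
            // e.g. the token "5" advances the position by 5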
            if (Character.isDigit(t.termBuffer()[0])) {
              t.setPositionIncrement(t.termBuffer()[0] - '0');
            }
            if (first) {
              // set payload on first position only
              t.setPayload(new Payload(new byte[] { 100 }));
              first = false;
            }

            // index a "synonym" for every token
            buffered = (Token) t.clone();
            buffered.setPayload(null);
            buffered.setPositionIncrement(0);
            buffered.setTermBuffer(new char[] { 'b' }, 0, 1);

            return t;
          }
        };
      }
    };

    IndexWriter writer = new IndexWriter(dir, analyzer, true);

    Document doc = new Document();
    doc.add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.TOKENIZED));

    writer.addDocument(doc);
    writer.flush();
    SegmentInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = SegmentReader.get(info);

    TermPositions termPositions = reader.termPositions(new Term("f1", "a"));
    assertTrue(termPositions.next());
    int freq = termPositions.freq();
    assertEquals(3, freq);
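    // "a 5 a a": the digit token bumps the increment to 5, so the three
    // "a" tokens land at positions 0, 6 and 7; only the occurrence at
    // position 0 carries the payload set by the filter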
    assertEquals(0, termPositions.nextPosition());
    assertTrue(termPositions.isPayloadAvailable());
    assertEquals(6, termPositions.nextPosition());
    assertFalse(termPositions.isPayloadAvailable());
    assertEquals(7, termPositions.nextPosition());
    assertFalse(termPositions.isPayloadAvailable());
  }

  public void testPreAnalyzedField() throws IOException {
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
    Document doc = new Document();

    doc.add(new Field("preanalyzed", new TokenStream() {
      private String[] tokens = new String[] { "term1", "term2", "term3", "term2" };
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    }, TermVector.NO));

    writer.addDocument(doc);
    writer.flush();
    SegmentInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = SegmentReader.get(info);
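    // the stream emitted term1, term2, term3, term2 at positions 0-3,
    // so expect term1 at 0, term2 at 1 and 3, and term3 at 2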

    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
  }

  /**
   * Test adding two fields with the same name, but
   * with different term vector settings (LUCENE-766).
   */
  public void testMixedTermVectorSettingsSameField() throws Exception {
    Document doc = new Document();
    // f1 first without tv then with tv
    doc.add(new Field("f1", "v1", Store.YES, Index.UN_TOKENIZED, TermVector.NO));
    doc.add(new Field("f1", "v2", Store.YES, Index.UN_TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS));
    // f2 first with tv then without tv
    doc.add(new Field("f2", "v1", Store.YES, Index.UN_TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("f2", "v2", Store.YES, Index.UN_TOKENIZED, TermVector.NO));

    RAMDirectory ram = new RAMDirectory();
    IndexWriter writer = new IndexWriter(ram, new StandardAnalyzer(), true);
    writer.addDocument(doc);
    writer.close();

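    // regardless of the order in which the two settings appear, the
    // "with tv" setting should win, so both values of f1 and both
    // values of f2 are expected in the stored term vectors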
    IndexReader reader = IndexReader.open(ram);
    // f1
    TermFreqVector tfv1 = reader.getTermFreqVector(0, "f1");
    assertNotNull(tfv1);
    assertEquals("the 'with_tv' setting should rule!", 2, tfv1.getTerms().length);
    // f2
    TermFreqVector tfv2 = reader.getTermFreqVector(0, "f2");
    assertNotNull(tfv2);
    assertEquals("the 'with_tv' setting should rule!", 2, tfv2.getTerms().length);
  }
}