001: package org.apache.lucene.index;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Analyzer;
021: import org.apache.lucene.analysis.Token;
022: import org.apache.lucene.analysis.TokenStream;
023: import org.apache.lucene.document.Document;
024: import org.apache.lucene.document.Field;
025: import org.apache.lucene.store.MockRAMDirectory;
026: import org.apache.lucene.util.LuceneTestCase;
027:
028: import java.io.IOException;
029: import java.io.Reader;
030: import java.util.Arrays;
031: import java.util.Iterator;
032: import java.util.Map;
033: import java.util.SortedSet;
034:
035: public class TestTermVectorsReader extends LuceneTestCase {
036: //Must be lexicographically sorted, will do in setup, versus trying to maintain here
037: private String[] testFields = { "f1", "f2", "f3", "f4" };
038: private boolean[] testFieldsStorePos = { true, false, true, false };
039: private boolean[] testFieldsStoreOff = { true, false, false, true };
040: private String[] testTerms = { "this", "is", "a", "test" };
041: private int[][] positions = new int[testTerms.length][];
042: private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
043: private MockRAMDirectory dir = new MockRAMDirectory();
044: private String seg;
045: private FieldInfos fieldInfos = new FieldInfos();
046: private static int TERM_FREQ = 3;
047:
/** Creates a named test case (JUnit 3 convention). */
public TestTermVectorsReader(String s) {
  super(s);
}
051:
052: private class TestToken implements Comparable {
053: String text;
054: int pos;
055: int startOffset;
056: int endOffset;
057:
058: public int compareTo(Object other) {
059: return pos - ((TestToken) other).pos;
060: }
061: }
062:
063: TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
064:
/**
 * Builds the fixture: generates sorted random positions and deterministic
 * offsets for each test term, writes one segment containing five identical
 * documents (one field per vector flavor), then loads that segment's
 * FieldInfos for the readers under test.
 */
protected void setUp() throws Exception {
  super .setUp();
  /*
  for (int i = 0; i < testFields.length; i++) {
  fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
  }
  */

  Arrays.sort(testTerms);
  int tokenUpto = 0;
  for (int i = 0; i < testTerms.length; i++) {
    positions[i] = new int[TERM_FREQ];
    offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
    // Occurrence j gets a position in [j*10, j*10+10), so the first
    // position is 0..9 and positions strictly increase per occurrence.
    for (int j = 0; j < TERM_FREQ; j++) {
      // positions are always sorted in increasing order
      positions[i][j] = (int) (j * 10 + Math.random() * 10);
      // offsets are always sorted in increasing order
      offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10
          + testTerms[i].length());
      TestToken token = tokens[tokenUpto++] = new TestToken();
      token.text = testTerms[i];
      token.pos = positions[i][j];
      token.startOffset = offsets[i][j].getStartOffset();
      token.endOffset = offsets[i][j].getEndOffset();
    }
  }
  // Merge all terms' occurrences into one position-ordered stream.
  Arrays.sort(tokens);

  // Compound file disabled so the .tvd/.tvx vector files stay separate
  // (test() checks for them by name).
  IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(),
      true);
  writer.setUseCompoundFile(false);
  Document doc = new Document();
  for (int i = 0; i < testFields.length; i++) {
    final Field.TermVector tv;
    if (testFieldsStorePos[i] && testFieldsStoreOff[i])
      tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
    else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
      tv = Field.TermVector.WITH_POSITIONS;
    else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
      tv = Field.TermVector.WITH_OFFSETS;
    else
      tv = Field.TermVector.YES;
    // Field text is empty: MyAnalyzer ignores it and replays tokens[].
    doc.add(new Field(testFields[i], "", Field.Store.NO,
        Field.Index.TOKENIZED, tv));
  }

  //Create 5 documents for testing, they all have the same
  //terms
  for (int j = 0; j < 5; j++)
    writer.addDocument(doc);
  writer.flush();
  // Capture the segment name before close so tests can open its files.
  seg = writer.newestSegment().name;
  writer.close();

  fieldInfos = new FieldInfos(dir, seg + "."
      + IndexFileNames.FIELD_INFOS_EXTENSION);
}
123:
124: private class MyTokenStream extends TokenStream {
125: int tokenUpto;
126:
127: public Token next() {
128: if (tokenUpto >= tokens.length)
129: return null;
130: else {
131: final Token t = new Token();
132: final TestToken testToken = tokens[tokenUpto++];
133: t.setTermText(testToken.text);
134: if (tokenUpto > 1)
135: t.setPositionIncrement(testToken.pos
136: - tokens[tokenUpto - 2].pos);
137: else
138: t.setPositionIncrement(testToken.pos + 1);
139: t.setStartOffset(testToken.startOffset);
140: t.setEndOffset(testToken.endOffset);
141: return t;
142: }
143: }
144: }
145:
146: private class MyAnalyzer extends Analyzer {
147: public TokenStream tokenStream(String fieldName, Reader reader) {
148: return new MyTokenStream();
149: }
150: }
151:
152: public void test() {
153: //Check to see the files were created properly in setup
154: assertTrue(dir.fileExists(seg + "."
155: + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
156: assertTrue(dir.fileExists(seg + "."
157: + IndexFileNames.VECTORS_INDEX_EXTENSION));
158: }
159:
160: public void testReader() throws IOException {
161: TermVectorsReader reader = new TermVectorsReader(dir, seg,
162: fieldInfos);
163: assertTrue(reader != null);
164: for (int j = 0; j < 5; j++) {
165: TermFreqVector vector = reader.get(j, testFields[0]);
166: assertTrue(vector != null);
167: String[] terms = vector.getTerms();
168: assertTrue(terms != null);
169: assertTrue(terms.length == testTerms.length);
170: for (int i = 0; i < terms.length; i++) {
171: String term = terms[i];
172: //System.out.println("Term: " + term);
173: assertTrue(term.equals(testTerms[i]));
174: }
175: }
176: }
177:
178: public void testPositionReader() throws IOException {
179: TermVectorsReader reader = new TermVectorsReader(dir, seg,
180: fieldInfos);
181: assertTrue(reader != null);
182: TermPositionVector vector;
183: String[] terms;
184: vector = (TermPositionVector) reader.get(0, testFields[0]);
185: assertTrue(vector != null);
186: terms = vector.getTerms();
187: assertTrue(terms != null);
188: assertTrue(terms.length == testTerms.length);
189: for (int i = 0; i < terms.length; i++) {
190: String term = terms[i];
191: //System.out.println("Term: " + term);
192: assertTrue(term.equals(testTerms[i]));
193: int[] positions = vector.getTermPositions(i);
194: assertTrue(positions != null);
195: assertTrue(positions.length == this .positions[i].length);
196: for (int j = 0; j < positions.length; j++) {
197: int position = positions[j];
198: assertTrue(position == this .positions[i][j]);
199: }
200: TermVectorOffsetInfo[] offset = vector.getOffsets(i);
201: assertTrue(offset != null);
202: assertTrue(offset.length == this .offsets[i].length);
203: for (int j = 0; j < offset.length; j++) {
204: TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
205: assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
206: }
207: }
208:
209: TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
210: assertTrue(freqVector != null);
211: assertTrue(freqVector instanceof TermPositionVector == false);
212: terms = freqVector.getTerms();
213: assertTrue(terms != null);
214: assertTrue(terms.length == testTerms.length);
215: for (int i = 0; i < terms.length; i++) {
216: String term = terms[i];
217: //System.out.println("Term: " + term);
218: assertTrue(term.equals(testTerms[i]));
219: }
220: }
221:
222: public void testOffsetReader() throws IOException {
223: TermVectorsReader reader = new TermVectorsReader(dir, seg,
224: fieldInfos);
225: assertTrue(reader != null);
226: TermPositionVector vector = (TermPositionVector) reader.get(0,
227: testFields[0]);
228: assertTrue(vector != null);
229: String[] terms = vector.getTerms();
230: assertTrue(terms != null);
231: assertTrue(terms.length == testTerms.length);
232: for (int i = 0; i < terms.length; i++) {
233: String term = terms[i];
234: //System.out.println("Term: " + term);
235: assertTrue(term.equals(testTerms[i]));
236: int[] positions = vector.getTermPositions(i);
237: assertTrue(positions != null);
238: assertTrue(positions.length == this .positions[i].length);
239: for (int j = 0; j < positions.length; j++) {
240: int position = positions[j];
241: assertTrue(position == this .positions[i][j]);
242: }
243: TermVectorOffsetInfo[] offset = vector.getOffsets(i);
244: assertTrue(offset != null);
245: assertTrue(offset.length == this .offsets[i].length);
246: for (int j = 0; j < offset.length; j++) {
247: TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
248: assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
249: }
250: }
251: }
252:
253: public void testMapper() throws IOException {
254: TermVectorsReader reader = new TermVectorsReader(dir, seg,
255: fieldInfos);
256: assertTrue(reader != null);
257: SortedTermVectorMapper mapper = new SortedTermVectorMapper(
258: new TermVectorEntryFreqSortedComparator());
259: reader.get(0, mapper);
260: SortedSet set = mapper.getTermVectorEntrySet();
261: assertTrue("set is null and it shouldn't be", set != null);
262: //three fields, 4 terms, all terms are the same
263: assertTrue("set Size: " + set.size() + " is not: " + 4, set
264: .size() == 4);
265: //Check offsets and positions
266: for (Iterator iterator = set.iterator(); iterator.hasNext();) {
267: TermVectorEntry tve = (TermVectorEntry) iterator.next();
268: assertTrue("tve is null and it shouldn't be", tve != null);
269: assertTrue("tve.getOffsets() is null and it shouldn't be",
270: tve.getOffsets() != null);
271: assertTrue(
272: "tve.getPositions() is null and it shouldn't be",
273: tve.getPositions() != null);
274:
275: }
276:
277: mapper = new SortedTermVectorMapper(
278: new TermVectorEntryFreqSortedComparator());
279: reader.get(1, mapper);
280: set = mapper.getTermVectorEntrySet();
281: assertTrue("set is null and it shouldn't be", set != null);
282: //three fields, 4 terms, all terms are the same
283: assertTrue("set Size: " + set.size() + " is not: " + 4, set
284: .size() == 4);
285: //Should have offsets and positions b/c we are munging all the fields together
286: for (Iterator iterator = set.iterator(); iterator.hasNext();) {
287: TermVectorEntry tve = (TermVectorEntry) iterator.next();
288: assertTrue("tve is null and it shouldn't be", tve != null);
289: assertTrue("tve.getOffsets() is null and it shouldn't be",
290: tve.getOffsets() != null);
291: assertTrue(
292: "tve.getPositions() is null and it shouldn't be",
293: tve.getPositions() != null);
294:
295: }
296:
297: FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(
298: new TermVectorEntryFreqSortedComparator());
299: reader.get(0, fsMapper);
300: Map map = fsMapper.getFieldToTerms();
301: assertTrue("map Size: " + map.size() + " is not: "
302: + testFields.length, map.size() == testFields.length);
303: for (Iterator iterator = map.entrySet().iterator(); iterator
304: .hasNext();) {
305: Map.Entry entry = (Map.Entry) iterator.next();
306: SortedSet sortedSet = (SortedSet) entry.getValue();
307: assertTrue("sortedSet Size: " + sortedSet.size()
308: + " is not: " + 4, sortedSet.size() == 4);
309: for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
310: TermVectorEntry tve = (TermVectorEntry) inner.next();
311: assertTrue("tve is null and it shouldn't be",
312: tve != null);
313: //Check offsets and positions.
314: assertTrue("tve is null and it shouldn't be",
315: tve != null);
316: String field = tve.getField();
317: if (field.equals(testFields[0])) {
318: //should have offsets
319:
320: assertTrue(
321: "tve.getOffsets() is null and it shouldn't be",
322: tve.getOffsets() != null);
323: assertTrue(
324: "tve.getPositions() is null and it shouldn't be",
325: tve.getPositions() != null);
326: } else if (field.equals(testFields[1])) {
327: //should not have offsets
328:
329: assertTrue(
330: "tve.getOffsets() is not null and it shouldn't be",
331: tve.getOffsets() == null);
332: assertTrue(
333: "tve.getPositions() is not null and it shouldn't be",
334: tve.getPositions() == null);
335: }
336: }
337: }
338: //Try mapper that ignores offs and positions
339: fsMapper = new FieldSortedTermVectorMapper(true, true,
340: new TermVectorEntryFreqSortedComparator());
341: reader.get(0, fsMapper);
342: map = fsMapper.getFieldToTerms();
343: assertTrue("map Size: " + map.size() + " is not: "
344: + testFields.length, map.size() == testFields.length);
345: for (Iterator iterator = map.entrySet().iterator(); iterator
346: .hasNext();) {
347: Map.Entry entry = (Map.Entry) iterator.next();
348: SortedSet sortedSet = (SortedSet) entry.getValue();
349: assertTrue("sortedSet Size: " + sortedSet.size()
350: + " is not: " + 4, sortedSet.size() == 4);
351: for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
352: TermVectorEntry tve = (TermVectorEntry) inner.next();
353: assertTrue("tve is null and it shouldn't be",
354: tve != null);
355: //Check offsets and positions.
356: assertTrue("tve is null and it shouldn't be",
357: tve != null);
358: String field = tve.getField();
359: if (field.equals(testFields[0])) {
360: //should have offsets
361:
362: assertTrue(
363: "tve.getOffsets() is null and it shouldn't be",
364: tve.getOffsets() == null);
365: assertTrue(
366: "tve.getPositions() is null and it shouldn't be",
367: tve.getPositions() == null);
368: } else if (field.equals(testFields[1])) {
369: //should not have offsets
370:
371: assertTrue(
372: "tve.getOffsets() is not null and it shouldn't be",
373: tve.getOffsets() == null);
374: assertTrue(
375: "tve.getPositions() is not null and it shouldn't be",
376: tve.getPositions() == null);
377: }
378: }
379: }
380:
381: // test setDocumentNumber()
382: IndexReader ir = IndexReader.open(dir);
383: DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
384: assertEquals(-1, docNumAwareMapper.getDocumentNumber());
385:
386: ir.getTermFreqVector(0, docNumAwareMapper);
387: assertEquals(0, docNumAwareMapper.getDocumentNumber());
388: docNumAwareMapper.setDocumentNumber(-1);
389:
390: ir.getTermFreqVector(1, docNumAwareMapper);
391: assertEquals(1, docNumAwareMapper.getDocumentNumber());
392: docNumAwareMapper.setDocumentNumber(-1);
393:
394: ir.getTermFreqVector(0, "f1", docNumAwareMapper);
395: assertEquals(0, docNumAwareMapper.getDocumentNumber());
396: docNumAwareMapper.setDocumentNumber(-1);
397:
398: ir.getTermFreqVector(1, "f2", docNumAwareMapper);
399: assertEquals(1, docNumAwareMapper.getDocumentNumber());
400: docNumAwareMapper.setDocumentNumber(-1);
401:
402: ir.getTermFreqVector(0, "f1", docNumAwareMapper);
403: assertEquals(0, docNumAwareMapper.getDocumentNumber());
404:
405: ir.close();
406:
407: }
408:
409: /**
410: * Make sure exceptions and bad params are handled appropriately
411: */
412: public void testBadParams() {
413: try {
414: TermVectorsReader reader = new TermVectorsReader(dir, seg,
415: fieldInfos);
416: assertTrue(reader != null);
417: //Bad document number, good field number
418: reader.get(50, testFields[0]);
419: fail();
420: } catch (IOException e) {
421: // expected exception
422: }
423: try {
424: TermVectorsReader reader = new TermVectorsReader(dir, seg,
425: fieldInfos);
426: assertTrue(reader != null);
427: //Bad document number, no field
428: reader.get(50);
429: fail();
430: } catch (IOException e) {
431: // expected exception
432: }
433: try {
434: TermVectorsReader reader = new TermVectorsReader(dir, seg,
435: fieldInfos);
436: assertTrue(reader != null);
437: //good document number, bad field number
438: TermFreqVector vector = reader.get(0, "f50");
439: assertTrue(vector == null);
440: } catch (IOException e) {
441: fail();
442: }
443: }
444:
445: public static class DocNumAwareMapper extends TermVectorMapper {
446:
447: public DocNumAwareMapper() {
448: }
449:
450: private int documentNumber = -1;
451:
452: public void setExpectations(String field, int numTerms,
453: boolean storeOffsets, boolean storePositions) {
454: if (documentNumber == -1) {
455: throw new RuntimeException(
456: "Documentnumber should be set at this point!");
457: }
458: }
459:
460: public void map(String term, int frequency,
461: TermVectorOffsetInfo[] offsets, int[] positions) {
462: if (documentNumber == -1) {
463: throw new RuntimeException(
464: "Documentnumber should be set at this point!");
465: }
466: }
467:
468: public int getDocumentNumber() {
469: return documentNumber;
470: }
471:
472: public void setDocumentNumber(int documentNumber) {
473: this.documentNumber = documentNumber;
474: }
475: }
476: }
|