package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
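/**
 * Tests storage and retrieval of term vectors: plain term frequency
 * vectors, vectors with positions and/or offsets, the TermVectorMapper
 * variants, and documents that mix vector settings for the same field.
 */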
public class TestTermVectors extends LuceneTestCase {
  private IndexSearcher searcher;
  private RAMDirectory directory = new RAMDirectory();

  public TestTermVectors(String s) {
    super(s);
  }

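  // Index 1000 docs whose term vector type depends on the doc number:
  // i % 6 == 0 -> positions + offsets, other even i -> positions only,
  // other i % 3 == 0 -> offsets only, everything else -> plain vectors.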
  public void setUp() throws Exception {
    super.setUp();
    IndexWriter writer = new IndexWriter(directory,
        new SimpleAnalyzer(), true);
    //writer.setUseCompoundFile(true);
    //writer.infoStream = System.out;
    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      Field.TermVector termVector;
      int mod3 = i % 3;
      int mod2 = i % 2;
      if (mod2 == 0 && mod3 == 0) {
        termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
      } else if (mod2 == 0) {
        termVector = Field.TermVector.WITH_POSITIONS;
      } else if (mod3 == 0) {
        termVector = Field.TermVector.WITH_OFFSETS;
      } else {
        termVector = Field.TermVector.YES;
      }
      doc.add(new Field("field", English.intToEnglish(i),
          Field.Store.YES, Field.Index.TOKENIZED, termVector));
      writer.addDocument(doc);
    }
    writer.close();
    searcher = new IndexSearcher(directory);
  }

  public void test() {
    assertNotNull(searcher);
  }

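  // Every doc in the index has a vector for "field", so each of the
  // 100 hits for "seventy" should expose exactly one term vector.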
  public void testTermVectors() {
    Query query = new TermQuery(new Term("field", "seventy"));
    try {
      Hits hits = searcher.search(query);
      assertEquals(100, hits.length());

      for (int i = 0; i < hits.length(); i++) {
        TermFreqVector[] vector = searcher.reader
            .getTermFreqVectors(hits.id(i));
        assertNotNull(vector);
        assertEquals(1, vector.length);
      }
    } catch (IOException e) {
      fail(e.toString());
    }
  }

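  // "zero" matches only doc 0 (i % 6 == 0), whose vector stores both
  // positions and offsets; verify the presence or absence of each
  // against the modulo scheme used in setUp().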
  public void testTermPositionVectors() {
    Query query = new TermQuery(new Term("field", "zero"));
    try {
      Hits hits = searcher.search(query);
      assertEquals(1, hits.length());

      for (int i = 0; i < hits.length(); i++) {
        TermFreqVector[] vector = searcher.reader
            .getTermFreqVectors(hits.id(i));
        assertNotNull(vector);
        assertEquals(1, vector.length);

        boolean shouldBePosVector = (hits.id(i) % 2 == 0);
        assertTrue(!shouldBePosVector
            || vector[0] instanceof TermPositionVector);

        boolean shouldBeOffVector = (hits.id(i) % 3 == 0);
        assertTrue(!shouldBeOffVector
            || vector[0] instanceof TermPositionVector);

        if (shouldBePosVector || shouldBeOffVector) {
          TermPositionVector posVec = (TermPositionVector) vector[0];
          String[] terms = posVec.getTerms();
          assertTrue(terms != null && terms.length > 0);

          for (int j = 0; j < terms.length; j++) {
            int[] positions = posVec.getTermPositions(j);
            TermVectorOffsetInfo[] offsets = posVec.getOffsets(j);

            if (shouldBePosVector) {
              assertNotNull(positions);
              assertTrue(positions.length > 0);
            } else {
              assertNull(positions);
            }

            if (shouldBeOffVector) {
              assertNotNull(offsets);
              assertTrue(offsets.length > 0);
            } else {
              assertNull(offsets);
            }
          }
        } else {
          // Neither positions nor offsets were stored, so the vector
          // must be a plain TermFreqVector, not a TermPositionVector.
          assertFalse(vector[0] instanceof TermPositionVector);
          TermFreqVector freqVec = vector[0];
          String[] terms = freqVec.getTerms();
          assertTrue(terms != null && terms.length > 0);
        }
      }
    } catch (IOException e) {
      fail(e.toString());
    }
  }

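  // "fifty" also hits 100 docs; this only verifies that each hit
  // exposes a single term vector.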
  public void testTermOffsetVectors() {
    Query query = new TermQuery(new Term("field", "fifty"));
    try {
      Hits hits = searcher.search(query);
      assertEquals(100, hits.length());

      for (int i = 0; i < hits.length(); i++) {
        TermFreqVector[] vector = searcher.reader
            .getTermFreqVectors(hits.id(i));
        assertNotNull(vector);
        assertEquals(1, vector.length);
      }
    } catch (IOException e) {
      fail(e.toString());
    }
  }

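  // Index four hand-built docs, then verify that the stored term vectors
  // agree with TermDocs frequencies, that the hit ordering is as expected,
  // and that per-term counts match test4Map.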
  public void testKnownSetOfDocuments() {
    String test1 = "eating chocolate in a computer lab"; //6 terms
    String test2 = "computer in a computer lab"; //5 terms
    String test3 = "a chocolate lab grows old"; //5 terms
    String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
    Map test4Map = new HashMap();
    test4Map.put("chocolate", new Integer(3));
    test4Map.put("lab", new Integer(2));
    test4Map.put("eating", new Integer(1));
    test4Map.put("computer", new Integer(1));
    test4Map.put("with", new Integer(1));
    test4Map.put("a", new Integer(1));
    test4Map.put("colored", new Integer(1));
    test4Map.put("in", new Integer(1));
    test4Map.put("an", new Integer(1));
    test4Map.put("old", new Integer(1));

    Document testDoc1 = new Document();
    setupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    setupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    setupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    setupDoc(testDoc4, test4);

    Directory dir = new RAMDirectory();

    try {
      IndexWriter writer = new IndexWriter(dir,
          new SimpleAnalyzer(), true);
      assertNotNull(writer);
      writer.addDocument(testDoc1);
      writer.addDocument(testDoc2);
      writer.addDocument(testDoc3);
      writer.addDocument(testDoc4);
      writer.close();
      IndexSearcher knownSearcher = new IndexSearcher(dir);
      TermEnum termEnum = knownSearcher.reader.terms();
      TermDocs termDocs = knownSearcher.reader.termDocs();

      Similarity sim = knownSearcher.getSimilarity();
      while (termEnum.next()) {
        Term term = termEnum.term();
        //System.out.println("Term: " + term);
        termDocs.seek(term);
        while (termDocs.next()) {
          int docId = termDocs.doc();
          int freq = termDocs.freq();
          //System.out.println("Doc Id: " + docId + " freq " + freq);
          TermFreqVector vector = knownSearcher.reader
              .getTermFreqVector(docId, "field");
          assertNotNull(vector);
          float tf = sim.tf(freq);
          float idf = sim.idf(term, knownSearcher);
          //float qNorm = sim.queryNorm()
          //This is fine since we don't have stop words
          float lNorm = sim.lengthNorm("field",
              vector.getTerms().length);
          //float coord = sim.coord()
          //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
          String[] vTerms = vector.getTerms();
          int[] freqs = vector.getTermFrequencies();
          for (int i = 0; i < vTerms.length; i++) {
            if (term.text().equals(vTerms[i])) {
              assertEquals(freq, freqs[i]);
            }
          }
        }
        //System.out.println("--------");
      }
      Query query = new TermQuery(new Term("field", "chocolate"));
      Hits hits = knownSearcher.search(query);
      //testDoc3 (doc id 2) should be the first hit b/c it is the shortest match
      assertEquals(3, hits.length());
      float score = hits.score(0);
      /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
      System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
      System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
      assertEquals(2, hits.id(0));
      assertEquals(3, hits.id(1));
      assertEquals(0, hits.id(2));
      TermFreqVector vector = knownSearcher.reader
          .getTermFreqVector(hits.id(1), "field");
      assertNotNull(vector);
      //System.out.println("Vector: " + vector);
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      assertTrue(terms != null && terms.length == 10);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        //System.out.println("Term: " + term);
        int freq = freqs[i];
        assertTrue(test4.indexOf(term) != -1);
        Integer freqInt = (Integer) test4Map.get(term);
        assertNotNull(freqInt);
        assertEquals(freqInt.intValue(), freq);
      }
      SortedTermVectorMapper mapper = new SortedTermVectorMapper(
          new TermVectorEntryFreqSortedComparator());
      knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
      SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
      assertEquals("mapper.getTermVectorEntrySet() size", 10,
          vectorEntrySet.size());
      TermVectorEntry last = null;
      for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
        TermVectorEntry tve = (TermVectorEntry) iterator.next();
        if (tve != null && last != null) {
          assertTrue("terms are not properly sorted",
              last.getFrequency() >= tve.getFrequency());
          Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
          //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
          assertEquals("frequency", 2 * expectedFreq.intValue(),
              tve.getFrequency());
        }
        last = tve;
      }

      FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(
          new TermVectorEntryFreqSortedComparator());
      knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
      Map map = fieldMapper.getFieldToTerms();
      assertEquals("map size", 2, map.size());
      vectorEntrySet = (SortedSet) map.get("field");
      assertNotNull("vectorEntrySet should not be null", vectorEntrySet);
      assertEquals("vectorEntrySet size", 10, vectorEntrySet.size());
      knownSearcher.close();
    } catch (IOException e) {
      e.printStackTrace();
      fail(e.toString());
    }
  }

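  // Adds the text to the doc twice: "field" stores plain term vectors,
  // while "field2" stores term vectors with positions and offsets.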
  private void setupDoc(Document doc, String text) {
    doc.add(new Field("field", text, Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.YES));
    doc.add(new Field("field2", text, Field.Store.YES,
        Field.Index.TOKENIZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS));
    //System.out.println("Document: " + doc);
  }

  // Test only a few docs having vectors
  public void testRareVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory,
        new SimpleAnalyzer(), true);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(i),
          Field.Store.YES, Field.Index.TOKENIZED,
          Field.TermVector.NO));
      writer.addDocument(doc);
    }
    for (int i = 0; i < 10; i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(100 + i),
          Field.Store.YES, Field.Index.TOKENIZED,
          Field.TermVector.WITH_POSITIONS_OFFSETS));
      writer.addDocument(doc);
    }

    writer.close();
    searcher = new IndexSearcher(directory);

    Query query = new TermQuery(new Term("field", "hundred"));
    Hits hits = searcher.search(query);
    assertEquals(10, hits.length());
    for (int i = 0; i < hits.length(); i++) {
      TermFreqVector[] vector = searcher.reader
          .getTermFreqVectors(hits.id(i));
      assertNotNull(vector);
      assertEquals(1, vector.length);
    }
  }

  // In a single doc, for the same field, mix the term
  // vectors up
  public void testMixedVectorsVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory,
        new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("field", "one", Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.add(new Field("field", "one", Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.YES));
    doc.add(new Field("field", "one", Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
    doc.add(new Field("field", "one", Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
    doc.add(new Field("field", "one", Field.Store.YES,
        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.close();

    searcher = new IndexSearcher(directory);

    Query query = new TermQuery(new Term("field", "one"));
    Hits hits = searcher.search(query);
    assertEquals(1, hits.length());

    TermFreqVector[] vector = searcher.reader
        .getTermFreqVectors(hits.id(0));
    assertNotNull(vector);
    assertEquals(1, vector.length);
    TermPositionVector tfv = (TermPositionVector) vector[0];
    assertEquals("field", tfv.getField());
    String[] terms = tfv.getTerms();
    assertEquals(1, terms.length);
    assertEquals("one", terms[0]);
    assertEquals(5, tfv.getTermFrequencies()[0]);

    int[] positions = tfv.getTermPositions(0);
    assertEquals(5, positions.length);
    for (int i = 0; i < 5; i++) {
      assertEquals(i, positions[i]);
    }
    TermVectorOffsetInfo[] offsets = tfv.getOffsets(0);
    assertEquals(5, offsets.length);
    for (int i = 0; i < 5; i++) {
      assertEquals(4 * i, offsets[i].getStartOffset());
      assertEquals(4 * i + 3, offsets[i].getEndOffset());
    }
  }
}
|