/**
 * Copyright 2004-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similar;

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;

import java.util.Set;
import java.util.HashMap;
import java.util.Map;
import java.util.Collection;
import java.util.Iterator;
import java.io.IOException;
import java.io.Reader;
import java.io.File;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;

/**
 * Generate "more like this" similarity queries.
 * Based on this mail:
 * <code><pre>
 * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
 * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
 * is usually fast enough. But looking up the docFreq() of every term in the document is
 * probably too slow.
 *
 * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
 * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
 * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
 * reduce the number of terms under consideration. Another heuristic is that terms with a
 * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
 * number of characters, not selecting anything less than, e.g., six or seven characters.
 * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
 * that do a pretty good job of characterizing a document.
 *
 * It all depends on what you're trying to do. If you're trying to eek out that last percent
 * of precision and recall regardless of computational difficulty so that you can win a TREC
 * competition, then the techniques I mention above are useless. But if you're trying to
 * provide a "more like this" button on a search results page that does a decent job and has
 * good performance, such techniques might be useful.
 *
 * An efficient, effective "more-like-this" query generator would be a great contribution, if
 * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
 * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
 * above. The frequency and length thresholds could be parameters, etc.
 *
 * Doug
 * </pre></code>
 *
 *
 * <p>
 * <h3>Initial Usage</h3>
 *
 * This class has lots of options to try to make it efficient and flexible.
 * See the body of {@link #main main()} below in the source for real code, or
 * if you want pseudo code, the simplest possible usage is as follows. The bold
 * fragment is specific to this class.
 *
 * <code><pre>
 *
 * IndexReader ir = ...
 * IndexSearcher is = ...
 * <b>
 * MoreLikeThis mlt = new MoreLikeThis(ir);
 * Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
 * Query query = mlt.like( target);
 * </b>
 * Hits hits = is.search(query);
 * <em>// now the usual iteration through 'hits' - the only thing to watch for is to make sure
 * you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
 *
 * </pre></code>
 *
 * Thus you:
 * <ol>
 * <li> do your normal Lucene setup for searching,
 * <li> create a MoreLikeThis,
 * <li> get the text of the doc you want to find similarities to
 * <li> then call one of the like() calls to generate a similarity query
 * <li> call the searcher to find the similar docs
 * </ol>
 *
 * <h3>More Advanced Usage</h3>
 *
 * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
 * multiple fields (e.g. body and title) for similarity.
 * <p>
 *
 * Depending on the size of your index and the size and makeup of your documents you
 * may want to call the other set methods to control how the similarity queries are
 * generated (a short configuration sketch follows this list):
 * <ul>
 * <li> {@link #setMinTermFreq setMinTermFreq(...)}
 * <li> {@link #setMinDocFreq setMinDocFreq(...)}
 * <li> {@link #setMinWordLen setMinWordLen(...)}
 * <li> {@link #setMaxWordLen setMaxWordLen(...)}
 * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
 * <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
 * <li> {@link #setStopWords setStopWords(...)}
 * </ul>
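 *
 * <p>
 * As an illustration only, a minimal configuration sketch might look like the following.
 * The field names "title" and "body" and the threshold values are assumptions for the
 * example, not recommendations, and <code>sourceText</code> stands for the text of the
 * document you want matches for:
 *
 * <code><pre>
 * MoreLikeThis mlt = new MoreLikeThis(ir);
 * mlt.setFieldNames(new String[] { "title", "body" }); // examine both fields
 * mlt.setMinTermFreq(1);     // keep terms that occur only once in the source doc
 * mlt.setMinDocFreq(2);      // but require them to appear in at least 2 indexed docs
 * mlt.setMaxQueryTerms(50);  // cap the size of the generated query
 * Query query = mlt.like(new StringReader(sourceText));
 * </pre></code>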
 *
 * <hr>
 * <pre>
 * Changes: Mark Harwood 29/02/04
 * Some bugfixing, some refactoring, some optimisation.
 *  - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector - added missing code
 *  - bugfix: no significant terms were being created for fields with a termvector, because
 *    only one occurrence per term/field pair was counted (i.e. frequency info from the TermVector was not included)
 *  - refactor: moved common code into isNoiseWord()
 *  - optimise: when no termvector support is available, use maxNumTokensParsed to limit the amount of tokenization
 * </pre>
 *
 * @author David Spencer
 * @author Bruce Ritchie
 * @author Mark Harwood
 */
public final class MoreLikeThis {

    /**
     * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
     * @see #getMaxNumTokensParsed
     */
    public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

    /**
     * Default analyzer to parse source doc with.
     * @see #getAnalyzer
     */
    public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();

    /**
     * Ignore terms with less than this frequency in the source doc.
     * @see #getMinTermFreq
     * @see #setMinTermFreq
     */
    public static final int DEFAULT_MIN_TERM_FREQ = 2;

    /**
     * Ignore words which do not occur in at least this many docs.
     * @see #getMinDocFreq
     * @see #setMinDocFreq
     */
    public static final int DEFAULT_MIN_DOC_FREQ = 5;

    /**
     * Boost terms in query based on score.
     * @see #isBoost
     * @see #setBoost
     */
    public static final boolean DEFAULT_BOOST = false;

    /**
     * Default field names. Null is used to specify that the field names should be looked
     * up at runtime from the provided reader.
     */
    public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents" };

    /**
     * Ignore words shorter than this length; if 0, this has no effect.
     * @see #getMinWordLen
     * @see #setMinWordLen
     */
    public static final int DEFAULT_MIN_WORD_LENGTH = 0;

    /**
     * Ignore words longer than this length; if 0, this has no effect.
     * @see #getMaxWordLen
     * @see #setMaxWordLen
     */
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

    /**
     * Default set of stopwords.
     * If null, stop words are allowed.
     *
     * @see #setStopWords
     * @see #getStopWords
     */
    public static final Set DEFAULT_STOP_WORDS = null;

    /**
     * Current set of stop words.
     */
    private Set stopWords = DEFAULT_STOP_WORDS;

    /**
     * Return a Query with no more than this many terms.
     *
     * @see BooleanQuery#getMaxClauseCount
     * @see #getMaxQueryTerms
     * @see #setMaxQueryTerms
     */
    public static final int DEFAULT_MAX_QUERY_TERMS = 25;

    /**
     * Analyzer that will be used to parse the doc.
     */
    private Analyzer analyzer = DEFAULT_ANALYZER;

    /**
     * Ignore words less frequent than this.
     */
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

    /**
     * Ignore words which do not occur in at least this many docs.
     */
    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;

    /**
     * Should we apply a boost to the Query based on the scores?
     */
    private boolean boost = DEFAULT_BOOST;

    /**
     * Field names we'll analyze.
     */
    private String[] fieldNames = DEFAULT_FIELD_NAMES;

    /**
     * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
     */
    private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;

    /**
     * Ignore words shorter than this length.
     */
    private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

    /**
     * Ignore words longer than this length.
     */
    private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

    /**
     * Don't return a query longer than this.
     */
    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

    /**
     * For idf() calculations.
     */
    private Similarity similarity = new DefaultSimilarity();

    /**
     * IndexReader to use.
     */
    private final IndexReader ir;

    /**
     * Constructor requiring an IndexReader.
     */
    public MoreLikeThis(IndexReader ir) {
        this.ir = ir;
    }

    /**
     * Returns the analyzer that will be used to parse the source doc. The default analyzer
     * is the {@link #DEFAULT_ANALYZER}.
     *
     * @return the analyzer that will be used to parse the source doc.
     * @see #DEFAULT_ANALYZER
     */
    public Analyzer getAnalyzer() {
        return analyzer;
    }

    /**
     * Sets the analyzer to use. An analyzer is not required for generating a query with the
     * {@link #like(int)} method; all other 'like' methods require an analyzer.
     *
     * @param analyzer the analyzer to use to tokenize text.
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Returns the frequency below which terms will be ignored in the source doc. The default
     * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
     *
     * @return the frequency below which terms will be ignored in the source doc.
     */
    public int getMinTermFreq() {
        return minTermFreq;
    }

    /**
     * Sets the frequency below which terms will be ignored in the source doc.
     *
     * @param minTermFreq the frequency below which terms will be ignored in the source doc.
     */
    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    /**
     * Returns the minimum document frequency: words which do not occur in at least this many
     * docs are ignored. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
     *
     * @return the minimum document frequency below which words are ignored.
     */
    public int getMinDocFreq() {
        return minDocFreq;
    }

    /**
     * Sets the minimum document frequency: words which do not occur in at least this many
     * docs are ignored.
     *
     * @param minDocFreq the minimum document frequency below which words are ignored.
     */
    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    /**
     * Returns whether to boost terms in the query based on "score" or not. The default is
     * {@link #DEFAULT_BOOST}.
     *
     * @return whether to boost terms in the query based on "score" or not.
     * @see #setBoost
     */
    public boolean isBoost() {
        return boost;
    }

    /**
     * Sets whether to boost terms in the query based on "score" or not.
     *
     * @param boost true to boost terms in the query based on "score", false otherwise.
     * @see #isBoost
     */
    public void setBoost(boolean boost) {
        this.boost = boost;
    }

    /**
     * Returns the field names that will be used when generating the 'More Like This' query.
     * The default field names that will be used are {@link #DEFAULT_FIELD_NAMES}.
     *
     * @return the field names that will be used when generating the 'More Like This' query.
     */
    public String[] getFieldNames() {
        return fieldNames;
    }

    /**
     * Sets the field names that will be used when generating the 'More Like This' query.
     * Set this to null for the field names to be determined at runtime from the IndexReader
     * provided in the constructor.
     *
     * @param fieldNames the field names that will be used when generating the 'More Like This'
     * query.
     */
    public void setFieldNames(String[] fieldNames) {
        this.fieldNames = fieldNames;
    }

    /**
     * Returns the minimum word length below which words will be ignored. Set this to 0 for no
     * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
     *
     * @return the minimum word length below which words will be ignored.
     */
    public int getMinWordLen() {
        return minWordLen;
    }

    /**
     * Sets the minimum word length below which words will be ignored.
     *
     * @param minWordLen the minimum word length below which words will be ignored.
     */
    public void setMinWordLen(int minWordLen) {
        this.minWordLen = minWordLen;
    }

    /**
     * Returns the maximum word length above which words will be ignored. Set this to 0 for no
     * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
     *
     * @return the maximum word length above which words will be ignored.
     */
    public int getMaxWordLen() {
        return maxWordLen;
    }

    /**
     * Sets the maximum word length above which words will be ignored.
     *
     * @param maxWordLen the maximum word length above which words will be ignored.
     */
    public void setMaxWordLen(int maxWordLen) {
        this.maxWordLen = maxWordLen;
    }

    /**
     * Set the set of stopwords.
     * Any word in this set is considered "uninteresting" and ignored.
     * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
     * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
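     * <p>
     * For example, one possible (illustrative) way to build the set is via
     * {@link org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()},
     * assuming the StopAnalyzer.ENGLISH_STOP_WORDS array is available in your Lucene version:
     * <code><pre>
     * mlt.setStopWords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
     * </pre></code>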
     *
     * @param stopWords set of stopwords; if null, stop words are allowed
     *
     * @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
     * @see #getStopWords
     */
    public void setStopWords(Set stopWords) {
        this.stopWords = stopWords;
    }

    /**
     * Get the current stop words being used.
     * @see #setStopWords
     */
    public Set getStopWords() {
        return stopWords;
    }

    /**
     * Returns the maximum number of query terms that will be included in any generated query.
     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
     *
     * @return the maximum number of query terms that will be included in any generated query.
     */
    public int getMaxQueryTerms() {
        return maxQueryTerms;
    }

    /**
     * Sets the maximum number of query terms that will be included in any generated query.
     *
     * @param maxQueryTerms the maximum number of query terms that will be included in any
     * generated query.
     */
    public void setMaxQueryTerms(int maxQueryTerms) {
        this.maxQueryTerms = maxQueryTerms;
    }

    /**
     * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
     * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
     */
    public int getMaxNumTokensParsed() {
        return maxNumTokensParsed;
    }

    /**
     * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
     */
    public void setMaxNumTokensParsed(int i) {
        maxNumTokensParsed = i;
    }

    /**
     * Return a query that will return docs like the passed lucene document ID.
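     * <p>
     * A brief sketch, assuming you already have a doc id (here taken from the first hit of an
     * earlier search via <code>Hits.id(0)</code>):
     * <code><pre>
     * int docNum = hits.id(0);         // doc id of a document already in the index
     * Query query = mlt.like(docNum);  // no Analyzer is needed for this variant
     * </pre></code>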
     *
     * @param docNum the document ID of the lucene doc to generate the 'More Like This' query for.
     * @return a query that will return docs like the passed lucene document ID.
     */
    public Query like(int docNum) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return createQuery(retrieveTerms(docNum));
    }

    /**
     * Return a query that will return docs like the passed file.
     *
     * @return a query that will return docs like the passed file.
     */
    public Query like(File f) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return like(new FileReader(f));
    }

    /**
     * Return a query that will return docs like the passed URL.
     *
     * @return a query that will return docs like the passed URL.
     */
    public Query like(URL u) throws IOException {
        return like(new InputStreamReader(u.openConnection().getInputStream()));
    }

    /**
     * Return a query that will return docs like the passed stream.
     *
     * @return a query that will return docs like the passed stream.
     */
    public Query like(java.io.InputStream is) throws IOException {
        return like(new InputStreamReader(is));
    }

    /**
     * Return a query that will return docs like the passed Reader.
     *
     * @return a query that will return docs like the passed Reader.
     */
    public Query like(Reader r) throws IOException {
        return createQuery(retrieveTerms(r));
    }

    /**
     * Create the 'More Like This' query from a PriorityQueue.
     */
    private Query createQuery(PriorityQueue q) {
        BooleanQuery query = new BooleanQuery();
        Object cur;
        int qterms = 0;
        float bestScore = 0;

        while (((cur = q.pop()) != null)) {
            Object[] ar = (Object[]) cur;
            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

            if (boost) {
                // the first term popped has the highest score, so boosts are
                // normalized relative to it (the best term gets a boost of 1.0)
                if (qterms == 0) {
                    bestScore = ((Float) ar[2]).floatValue();
                }
                float myScore = ((Float) ar[2]).floatValue();

                tq.setBoost(myScore / bestScore);
            }

            try {
                query.add(tq, BooleanClause.Occur.SHOULD);
            } catch (BooleanQuery.TooManyClauses ignore) {
                break;
            }

            qterms++;
            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
                break;
            }
        }

        return query;
    }

    /**
     * Create a PriorityQueue from a word->tf map.
     *
     * @param words a map of words keyed on the word (String) with Int objects as the values.
     */
    private PriorityQueue createQueue(Map words) throws IOException {
        // have collected all words in doc and their freqs
        int numDocs = ir.numDocs();
        FreqQ res = new FreqQ(words.size()); // will order words by score

        Iterator it = words.keySet().iterator();
        while (it.hasNext()) { // for every word
            String word = (String) it.next();

            int tf = ((Int) words.get(word)).x; // term freq in the source doc
            if (minTermFreq > 0 && tf < minTermFreq) {
                continue; // filter out words that don't occur enough times in the source
            }

            // go through all the fields and find the largest document frequency
            String topField = fieldNames[0];
            int docFreq = 0;
            for (int i = 0; i < fieldNames.length; i++) {
                int freq = ir.docFreq(new Term(fieldNames[i], word));
                topField = (freq > docFreq) ? fieldNames[i] : topField;
                docFreq = (freq > docFreq) ? freq : docFreq;
            }

            if (minDocFreq > 0 && docFreq < minDocFreq) {
                continue; // filter out words that don't occur in enough docs
            }

            if (docFreq == 0) {
                continue; // index update problem?
            }

            float idf = similarity.idf(docFreq, numDocs);
            float score = tf * idf;

            // only really need 1st 3 entries, other ones are for troubleshooting
            res.insert(new Object[] { word,                 // the word
                                      topField,             // the top field
                                      new Float(score),     // overall score
                                      new Float(idf),       // idf
                                      new Integer(docFreq), // freq in all docs
                                      new Integer(tf) });
        }
        return res;
    }

    /**
     * Describe the parameters that control how the "more like this" query is formed.
     */
    public String describeParams() {
        StringBuffer sb = new StringBuffer();
        sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
        sb.append("\t" + "minWordLen : " + minWordLen + "\n");
        sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
        sb.append("\t" + "fieldNames : ");
        String delim = "";
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            sb.append(delim).append(fieldName);
            delim = ", ";
        }
        sb.append("\n");
        sb.append("\t" + "boost : " + boost + "\n");
        sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
        sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
        return sb.toString();
    }

    /**
     * Test driver.
     * Pass in "-i INDEX" and then either "-f FILE" or "-url URL".
     */
    public static void main(String[] a) throws Throwable {
        String indexName = "localhost_index";
        String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
        URL url = null;
        for (int i = 0; i < a.length; i++) {
            if (a[i].equals("-i")) {
                indexName = a[++i];
            } else if (a[i].equals("-f")) {
                fn = a[++i];
            } else if (a[i].equals("-url")) {
                url = new URL(a[++i]);
            }
        }

        PrintStream o = System.out;
        IndexReader r = IndexReader.open(indexName);
        o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");

        MoreLikeThis mlt = new MoreLikeThis(r);

        o.println("Query generation parameters:");
        o.println(mlt.describeParams());
        o.println();

        Query query = null;
        if (url != null) {
            o.println("Parsing URL: " + url);
            query = mlt.like(url);
        } else if (fn != null) {
            o.println("Parsing file: " + fn);
            query = mlt.like(new File(fn));
        }

        o.println("q: " + query);
        o.println();
        IndexSearcher searcher = new IndexSearcher(indexName);

        Hits hits = searcher.search(query);
        int len = hits.length();
        o.println("found: " + len + " documents matching");
        o.println();
        for (int i = 0; i < Math.min(25, len); i++) {
            Document d = hits.doc(i);
            String summary = d.get("summary");
            o.println("score : " + hits.score(i));
            o.println("url : " + d.get("url"));
            o.println("\ttitle : " + d.get("title"));
            if (summary != null)
                o.println("\tsummary: " + summary);
            o.println();
        }
    }

    /**
     * Find words for a more-like-this query former.
     *
     * @param docNum the id of the lucene document from which to find terms
     */
    private PriorityQueue retrieveTerms(int docNum) throws IOException {
        Map termFreqMap = new HashMap();
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);

            // field does not store term vector info
            if (vector == null) {
                Document d = ir.document(docNum);
                String text[] = d.getValues(fieldName);
                if (text != null) {
                    for (int j = 0; j < text.length; j++) {
                        addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
                    }
                }
            } else {
                addTermFrequencies(termFreqMap, vector);
            }

        }

        return createQueue(termFreqMap);
    }

    /**
     * Adds terms and frequencies found in vector into the Map termFreqMap.
     * @param termFreqMap a Map of terms and their frequencies
     * @param vector List of terms and their frequencies for a doc/field
     */
    private void addTermFrequencies(Map termFreqMap, TermFreqVector vector) {
        String[] terms = vector.getTerms();
        int freqs[] = vector.getTermFrequencies();
        for (int j = 0; j < terms.length; j++) {
            String term = terms[j];

            if (isNoiseWord(term)) {
                continue;
            }
            // increment frequency
            Int cnt = (Int) termFreqMap.get(term);
            if (cnt == null) {
                cnt = new Int();
                termFreqMap.put(term, cnt);
                cnt.x = freqs[j];
            } else {
                cnt.x += freqs[j];
            }
        }
    }

    /**
     * Adds term frequencies found by tokenizing text from reader into the Map words.
     * @param r a source of text to be tokenized
     * @param termFreqMap a Map of terms and their frequencies
     * @param fieldName Used by analyzer for any special per-field analysis
     */
    private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName)
            throws IOException {
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        org.apache.lucene.analysis.Token token;
        int tokenCount = 0;
        while ((token = ts.next()) != null) { // for every token
            String word = token.termText();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            // increment frequency
            Int cnt = (Int) termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
    }

    /**
     * Determines if the passed term is likely to be of interest in "more like" comparisons.
     *
     * @param term the word being considered
     * @return true if the term should be ignored, false if it should be used in further analysis
     */
    private boolean isNoiseWord(String term) {
        int len = term.length();
        if (minWordLen > 0 && len < minWordLen) {
            return true;
        }
        if (maxWordLen > 0 && len > maxWordLen) {
            return true;
        }
        if (stopWords != null && stopWords.contains(term)) {
            return true;
        }
        return false;
    }

    /**
     * Find words for a more-like-this query former.
     * The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
     * Each array has 6 elements.
     * The elements are:
     * <ol>
     * <li> The word (String)
     * <li> The top field that this word comes from (String)
     * <li> The score for this word (Float)
     * <li> The IDF value (Float)
     * <li> The frequency of this word in the index (Integer)
     * <li> The frequency of this word in the source document (Integer)
     * </ol>
     * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
     * This method is exposed so that you can identify the "interesting words" in a document.
     * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
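     * <p>
     * A rough sketch of consuming the result (the array layout is as listed above; the
     * variable names are illustrative only):
     * <code><pre>
     * PriorityQueue q = mlt.retrieveTerms(reader);
     * Object[] entry;
     * while ((entry = (Object[]) q.pop()) != null) {
     *     System.out.println(entry[0] + " (field=" + entry[1] + ", score=" + entry[2] + ")");
     * }
     * </pre></code>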
     *
     * @param r the reader that has the content of the document
     * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
     *
     * @see #retrieveInterestingTerms
     */
    public PriorityQueue retrieveTerms(Reader r) throws IOException {
        Map words = new HashMap();
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            addTermFrequencies(r, words, fieldName);
        }
        return createQueue(words);
    }

    /**
     * Convenience routine to make it easy to return the most interesting words in a document.
     * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
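     * <p>
     * For example (a minimal sketch; <code>reader</code> is assumed to supply the source text):
     * <code><pre>
     * String[] interesting = mlt.retrieveInterestingTerms(reader);
     * </pre></code>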
     * @param r the source document
     * @return the most interesting words in the document
     *
     * @see #retrieveTerms(java.io.Reader)
     * @see #setMaxQueryTerms
     */
    public String[] retrieveInterestingTerms(Reader r) throws IOException {
        ArrayList al = new ArrayList(maxQueryTerms);
        PriorityQueue pq = retrieveTerms(r);
        Object cur;
        int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
        // we just want to return the top words
        while (((cur = pq.pop()) != null) && lim-- > 0) {
            Object[] ar = (Object[]) cur;
            al.add(ar[0]); // the 1st entry is the interesting word
        }
        String[] res = new String[al.size()];
        return (String[]) al.toArray(res);
    }

    /**
     * PriorityQueue that orders words by score.
     */
    private static class FreqQ extends PriorityQueue {
        FreqQ(int s) {
            initialize(s);
        }

        /**
         * Reversed comparison: the entry with the higher score is treated as "less than",
         * so pop() returns the highest-scoring entries first.
         */
        protected boolean lessThan(Object a, Object b) {
            Object[] aa = (Object[]) a;
            Object[] bb = (Object[]) b;
            Float fa = (Float) aa[2];
            Float fb = (Float) bb[2];
            return fa.floatValue() > fb.floatValue();
        }
    }

    /**
     * Use for frequencies and to avoid creating new Integer objects on every increment.
     */
    private static class Int {
        int x;

        Int() {
            x = 1;
        }
    }

}
|