01: package org.apache.lucene.misc;
02:
03: /**
04: * Copyright 2004 The Apache Software Foundation
05: *
06: * Licensed under the Apache License, Version 2.0 (the "License");
07: * you may not use this file except in compliance with the License.
08: * You may obtain a copy of the License at
09: *
10: * http://www.apache.org/licenses/LICENSE-2.0
11: *
12: * Unless required by applicable law or agreed to in writing, software
13: * distributed under the License is distributed on an "AS IS" BASIS,
14: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15: * See the License for the specific language governing permissions and
16: * limitations under the License.
17: */
18:
19: import org.apache.lucene.index.IndexReader;
20: import org.apache.lucene.index.Term;
21: import org.apache.lucene.index.TermEnum;
22: import org.apache.lucene.util.PriorityQueue;
23:
24: /**
25: * <code>HighFreqTerms</code> class extracts terms and their frequencies out
26: * of an existing Lucene index.
27: *
28: * @version $Id: HighFreqTerms.java 376393 2006-02-09 19:17:14Z otis $
29: */
30: public class HighFreqTerms {
31:
32: // The top numTerms will be displayed
33: public static final int numTerms = 100;
34:
35: public static void main(String[] args) throws Exception {
36: IndexReader reader = null;
37: String field = null;
38: if (args.length == 1) {
39: reader = IndexReader.open(args[0]);
40: } else if (args.length == 2) {
41: reader = IndexReader.open(args[0]);
42: field = args[1];
43: } else {
44: usage();
45: System.exit(1);
46: }
47:
48: TermInfoQueue tiq = new TermInfoQueue(numTerms);
49: TermEnum terms = reader.terms();
50:
51: if (field != null) {
52: while (terms.next()) {
53: if (terms.term().field().equals(field)) {
54: tiq.insert(new TermInfo(terms.term(), terms
55: .docFreq()));
56: }
57: }
58: } else {
59: while (terms.next()) {
60: tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
61: }
62: }
63: while (tiq.size() != 0) {
64: TermInfo termInfo = (TermInfo) tiq.pop();
65: System.out.println(termInfo.term + " " + termInfo.docFreq);
66: }
67:
68: reader.close();
69: }
70:
71: private static void usage() {
72: System.out
73: .println("\n\n"
74: + "java org.apache.lucene.misc.HighFreqTerms <index dir> [field]\n\n");
75: }
76: }
77:
78: final class TermInfo {
79: TermInfo(Term t, int df) {
80: term = t;
81: docFreq = df;
82: }
83:
84: int docFreq;
85: Term term;
86: }
87:
88: final class TermInfoQueue extends PriorityQueue {
89: TermInfoQueue(int size) {
90: initialize(size);
91: }
92:
93: protected final boolean lessThan(Object a, Object b) {
94: TermInfo termInfoA = (TermInfo) a;
95: TermInfo termInfoB = (TermInfo) b;
96: return termInfoA.docFreq < termInfoB.docFreq;
97: }
98: }
|