001: package lucli;
002:
003: /* ====================================================================
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 2001 The Apache Software Foundation. All rights
007: * reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution,
022: * if any, must include the following acknowledgment:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgment may appear in the software itself,
026: * if and wherever such third-party acknowledgments normally appear.
027: *
028: * 4. The names "Apache" and "Apache Software Foundation" and
029: * "Apache Lucene" must not be used to endorse or promote products
030: * derived from this software without prior written permission. For
031: * written permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache",
034: * "Apache Lucene", nor may "Apache" appear in their name, without
035: * prior written permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: */
056:
057: import java.io.IOException;
058: import java.io.Reader;
059: import java.io.StringReader;
060:
061: import java.util.Hashtable;
062: import java.util.Vector;
063: import java.util.TreeMap;
064: import java.util.Map.Entry;
065: import java.util.Set;
066: import java.util.Arrays;
067: import java.util.Comparator;
068: import java.util.Iterator;
069: import java.util.Enumeration;
070:
071: import jline.ConsoleReader;
072:
073: import org.apache.lucene.analysis.Analyzer;
074: import org.apache.lucene.analysis.Token;
075: import org.apache.lucene.analysis.TokenStream;
076: import org.apache.lucene.analysis.standard.StandardAnalyzer;
077: import org.apache.lucene.document.Document;
078: import org.apache.lucene.document.Field;
079: import org.apache.lucene.index.IndexReader;
080: import org.apache.lucene.index.IndexWriter;
081: import org.apache.lucene.index.Term;
082: import org.apache.lucene.index.TermEnum;
083: import org.apache.lucene.index.IndexReader.FieldOption;
084: import org.apache.lucene.queryParser.MultiFieldQueryParser;
085: import org.apache.lucene.queryParser.ParseException;
086: import org.apache.lucene.search.Explanation;
087: import org.apache.lucene.search.Hits;
088: import org.apache.lucene.search.IndexSearcher;
089: import org.apache.lucene.search.Query;
090: import org.apache.lucene.search.Searcher;
091:
092: /**
093: * Various methods that interact with Lucene and provide info about the
094: * index, search, etc. Parts addapted from Lucene demo.
095: */
096: class LuceneMethods {
097:
098: private int numDocs;
099: private String indexName; //directory of this index
100: private java.util.Iterator fieldIterator;
101: private Vector fields; //Fields as a vector
102: private Vector indexedFields; //Fields as a vector
103: private String fieldsArray[]; //Fields as an array
104: private Searcher searcher;
105: private Query query; //current query string
106:
107: public LuceneMethods(String index) {
108: indexName = index;
109: message("Lucene CLI. Using directory '" + indexName
110: + "'. Type 'help' for instructions.");
111: }
112:
113: public void info() throws java.io.IOException {
114: IndexReader indexReader = IndexReader.open(indexName);
115:
116: getFieldInfo();
117: numDocs = indexReader.numDocs();
118: message("Index has " + numDocs + " documents ");
119: message("All Fields:" + fields.toString());
120: message("Indexed Fields:" + indexedFields.toString());
121:
122: if (IndexReader.isLocked(indexName)) {
123: message("Index is locked");
124: }
125: //IndexReader.getCurrentVersion(indexName);
126: //System.out.println("Version:" + version);
127:
128: indexReader.close();
129: }
130:
131: public void search(String queryString, boolean explain,
132: boolean showTokens, ConsoleReader cr)
133: throws java.io.IOException,
134: org.apache.lucene.queryParser.ParseException {
135: Hits hits = initSearch(queryString);
136: System.out.println(hits.length() + " total matching documents");
137: if (explain) {
138: query = explainQuery(queryString);
139: }
140:
141: final int HITS_PER_PAGE = 10;
142: message("--------------------------------------");
143: for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
144: int end = Math.min(hits.length(), start + HITS_PER_PAGE);
145: for (int ii = start; ii < end; ii++) {
146: Document doc = hits.doc(ii);
147: message("---------------- " + (ii + 1) + " score:"
148: + hits.score(ii) + "---------------------");
149: printHit(doc);
150: if (showTokens) {
151: invertDocument(doc);
152: }
153: if (explain) {
154: Explanation exp = searcher.explain(query, hits
155: .id(ii));
156: message("Explanation:" + exp.toString());
157: }
158: }
159: message("#################################################");
160:
161: if (hits.length() > end) {
162: // TODO: don't let the input end up in the command line history
163: queryString = cr.readLine("more (y/n) ? ");
164: if (queryString.length() == 0
165: || queryString.charAt(0) == 'n')
166: break;
167: }
168: }
169: searcher.close();
170: }
171:
172: /**
173: * @todo Allow user to specify what field(s) to display
174: */
175: private void printHit(Document doc) {
176: for (int ii = 0; ii < fieldsArray.length; ii++) {
177: String currField = fieldsArray[ii];
178: String[] result = doc.getValues(currField);
179: if (result != null) {
180: for (int i = 0; i < result.length; i++) {
181: message(currField + ":" + result[i]);
182: }
183: } else {
184: message(currField + ": <not available>");
185: }
186: }
187: //another option is to just do message(doc);
188: }
189:
190: public void optimize() throws IOException {
191: //open the index writer. False: don't create a new one
192: IndexWriter indexWriter = new IndexWriter(indexName,
193: new StandardAnalyzer(), false);
194: message("Starting to optimize index.");
195: long start = System.currentTimeMillis();
196: indexWriter.optimize();
197: message("Done optimizing index. Took "
198: + (System.currentTimeMillis() - start) + " msecs");
199: indexWriter.close();
200: }
201:
202: private Query explainQuery(String queryString) throws IOException,
203: ParseException {
204:
205: searcher = new IndexSearcher(indexName);
206: Analyzer analyzer = new StandardAnalyzer();
207: getFieldInfo();
208:
209: int arraySize = indexedFields.size();
210: String indexedArray[] = new String[arraySize];
211: for (int ii = 0; ii < arraySize; ii++) {
212: indexedArray[ii] = (String) indexedFields.get(ii);
213: }
214: MultiFieldQueryParser parser = new MultiFieldQueryParser(
215: indexedArray, analyzer);
216: query = parser.parse(queryString);
217: System.out.println("Searching for: " + query.toString());
218: return (query);
219:
220: }
221:
222: /**
223: * @todo Allow user to specify analyzer
224: */
225: private Hits initSearch(String queryString) throws IOException,
226: ParseException {
227:
228: searcher = new IndexSearcher(indexName);
229: Analyzer analyzer = new StandardAnalyzer();
230: getFieldInfo();
231:
232: int arraySize = fields.size();
233: fieldsArray = new String[arraySize];
234: for (int ii = 0; ii < arraySize; ii++) {
235: fieldsArray[ii] = (String) fields.get(ii);
236: }
237: MultiFieldQueryParser parser = new MultiFieldQueryParser(
238: fieldsArray, analyzer);
239: query = parser.parse(queryString);
240: System.out.println("Searching for: " + query.toString());
241: Hits hits = searcher.search(query);
242: return (hits);
243:
244: }
245:
246: public void count(String queryString) throws java.io.IOException,
247: ParseException {
248: Hits hits = initSearch(queryString);
249: System.out.println(hits.length() + " total documents");
250: searcher.close();
251: }
252:
253: static public void message(String s) {
254: System.out.println(s);
255: }
256:
257: private void getFieldInfo() throws IOException {
258: IndexReader indexReader = IndexReader.open(indexName);
259: fields = new Vector();
260: indexedFields = new Vector();
261:
262: //get the list of all field names
263: fieldIterator = indexReader.getFieldNames(FieldOption.ALL)
264: .iterator();
265: while (fieldIterator.hasNext()) {
266: Object field = fieldIterator.next();
267: if (field != null && !field.equals(""))
268: fields.add(field.toString());
269: }
270: //
271: //get the list of indexed field names
272: fieldIterator = indexReader.getFieldNames(FieldOption.INDEXED)
273: .iterator();
274: while (fieldIterator.hasNext()) {
275: Object field = fieldIterator.next();
276: if (field != null && !field.equals(""))
277: indexedFields.add(field.toString());
278: }
279: indexReader.close();
280: }
281:
282: // Copied from DocumentWriter
283: // Tokenizes the fields of a document into Postings.
284: private void invertDocument(Document doc) throws IOException {
285:
286: Hashtable tokenHash = new Hashtable();
287: final int maxFieldLength = 10000;
288:
289: Analyzer analyzer = new StandardAnalyzer();
290: Enumeration fields = doc.fields();
291: while (fields.hasMoreElements()) {
292: Field field = (Field) fields.nextElement();
293: String fieldName = field.name();
294:
295: if (field.isIndexed()) {
296: if (field.isTokenized()) { // un-tokenized field
297: Reader reader; // find or make Reader
298: if (field.readerValue() != null)
299: reader = field.readerValue();
300: else if (field.stringValue() != null)
301: reader = new StringReader(field.stringValue());
302: else
303: throw new IllegalArgumentException(
304: "field must have either String or Reader value");
305:
306: int position = 0;
307: // Tokenize field and add to postingTable
308: TokenStream stream = analyzer.tokenStream(
309: fieldName, reader);
310: try {
311: for (Token t = stream.next(); t != null; t = stream
312: .next()) {
313: position += (t.getPositionIncrement() - 1);
314: position++;
315: String name = t.termText();
316: Integer Count = (Integer) tokenHash
317: .get(name);
318: if (Count == null) { // not in there yet
319: tokenHash.put(name, new Integer(1)); //first one
320: } else {
321: int count = Count.intValue();
322: tokenHash.put(name, new Integer(
323: count + 1));
324: }
325: if (position > maxFieldLength)
326: break;
327: }
328: } finally {
329: stream.close();
330: }
331: }
332:
333: }
334: }
335: Entry[] sortedHash = getSortedHashtableEntries(tokenHash);
336: for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
337: Entry currentEntry = sortedHash[ii];
338: message((ii + 1) + ":" + currentEntry.getKey() + " "
339: + currentEntry.getValue());
340: }
341: }
342:
343: /** Provides a list of the top terms of the index.
344: *
345: * @param field - the name of the command or null for all of them.
346: */
347: public void terms(String field) throws IOException {
348: TreeMap termMap = new TreeMap();
349: IndexReader indexReader = IndexReader.open(indexName);
350: TermEnum terms = indexReader.terms();
351: while (terms.next()) {
352: Term term = terms.term();
353: //message(term.field() + ":" + term.text() + " freq:" + terms.docFreq());
354: //if we're either not looking by field or we're matching the specific field
355: if ((field == null) || field.equals(term.field()))
356: termMap.put(term.field() + ":" + term.text(),
357: new Integer((terms.docFreq())));
358: }
359:
360: Iterator termIterator = termMap.keySet().iterator();
361: for (int ii = 0; termIterator.hasNext() && ii < 100; ii++) {
362: String termDetails = (String) termIterator.next();
363: Integer termFreq = (Integer) termMap.get(termDetails);
364: message(termDetails + ": " + termFreq);
365: }
366: indexReader.close();
367: }
368:
369: /** Sort Hashtable values
370: * @param h the hashtable we're sorting
371: * from http://developer.java.sun.com/developer/qow/archive/170/index.jsp
372: */
373:
374: public static Entry[] getSortedHashtableEntries(Hashtable h) {
375: Set set = h.entrySet();
376: Entry[] entries = (Entry[]) set.toArray(new Entry[set.size()]);
377: Arrays.sort(entries, new Comparator() {
378: public int compare(Object o1, Object o2) {
379: Object v1 = ((Entry) o1).getValue();
380: Object v2 = ((Entry) o2).getValue();
381: return ((Comparable) v2).compareTo(v1); //descending order
382: }
383: });
384: return entries;
385: }
386:
387: }
|