001: package org.apache.lucene.wordnet;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.store.*;
021: import org.apache.lucene.search.*;
022: import org.apache.lucene.index.*;
023: import org.apache.lucene.document.*;
024: import org.apache.lucene.analysis.*;
025: import org.apache.lucene.analysis.standard.*;
026: import java.io.*;
027: import java.util.*;
028:
029: /**
030: * Expand a query by looking up synonyms for every term.
031: * You need to invoke {@link Syns2Index} first to build the synonym index.
032: *
033: * @see Syns2Index
034: */
035: public final class SynExpand {
036:
037: /**
038: * Test driver for synonym expansion.
039: * Uses boost factor of 0.9 for illustrative purposes.
040: *
041: * If you pass in the query "big dog" then it prints out:
042: *
043: * <code><pre>
044: * Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9
045: * dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
046: * </pre></code>
047: */
048: public static void main(String[] args) throws IOException {
049: if (args.length != 2) {
050: System.out
051: .println("java org.apache.lucene.wordnet.SynExpand <index path> <query>");
052: }
053:
054: FSDirectory directory = FSDirectory
055: .getDirectory(args[0], false);
056: IndexSearcher searcher = new IndexSearcher(directory);
057:
058: String query = args[1];
059: String field = "contents";
060:
061: Query q = expand(query, searcher, new StandardAnalyzer(),
062: field, 0.9f);
063: System.out.println("Query: " + q.toString(field));
064:
065: searcher.close();
066: directory.close();
067: }
068:
069: /**
070: * Perform synonym expansion on a query.
071: *
072: * @param query users query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser.
073: *
074: * @param syns a opened to the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered.
075: *
076: * @param a optional analyzer used to parse the users query else {@link StandardAnalyzer} is used
077: *
078: * @param field optional field name to search in or null if you want the default of "contents"
079: *
080: * @param boost optional boost applied to synonyms else no boost is applied
081: *
082: * @return the expanded Query
083: */
084: public static Query expand(String query, Searcher syns, Analyzer a,
085: String field, float boost) throws IOException {
086: Set already = new HashSet(); // avoid dups
087: List top = new LinkedList(); // needs to be separately listed..
088: if (field == null)
089: field = "contents";
090: if (a == null)
091: a = new StandardAnalyzer();
092:
093: // [1] Parse query into separate words so that when we expand we can avoid dups
094: TokenStream ts = a.tokenStream(field, new StringReader(query));
095: org.apache.lucene.analysis.Token t;
096: while ((t = ts.next()) != null) {
097: String word = t.termText();
098: if (already.add(word))
099: top.add(word);
100: }
101: BooleanQuery tmp = new BooleanQuery();
102:
103: // [2] form query
104: Iterator it = top.iterator();
105: while (it.hasNext()) {
106: // [2a] add to level words in
107: String word = (String) it.next();
108: TermQuery tq = new TermQuery(new Term(field, word));
109: tmp.add(tq, BooleanClause.Occur.SHOULD);
110:
111: // [2b] add in unique synonums
112: Hits hits = syns.search(new TermQuery(new Term(
113: Syns2Index.F_WORD, word)));
114: for (int i = 0; i < hits.length(); i++) {
115: Document doc = hits.doc(i);
116: String[] values = doc.getValues(Syns2Index.F_SYN);
117: for (int j = 0; j < values.length; j++) {
118: String syn = values[j];
119: if (already.add(syn)) // avoid dups of top level words and synonyms
120: {
121: tq = new TermQuery(new Term(field, syn));
122: if (boost > 0) // else keep normal 1.0
123: tq.setBoost(boost);
124: tmp.add(tq, BooleanClause.Occur.SHOULD);
125: }
126: }
127: }
128: }
129:
130: return tmp;
131: }
132:
133: }
|