package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Converts the prolog file <code>wn_s.pl</code> from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
 * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
 *
 * This has been tested with WordNet 2.0.
 *
 * The index has fields named "word" ({@link #F_WORD})
 * and "syn" ({@link #F_SYN}).
 * <p>
 * The source word (such as 'big') can be looked up in the
 * "word" field, and if present there will be fields named "syn"
 * for every synonym. Note that in the general case a word has multiple
 * synonyms, so the document contains <b>multiple</b> fields with the same
 * name. That is not a problem for Lucene; retrieve them all with
 * {@link org.apache.lucene.document.Document#getValues}.
 * </p>
 * <p>
 * While the WordNet file distinguishes groups (synsets) of synonyms with
 * related meanings, this tool does not preserve that distinction.
 * </p>
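 *
 * <p>
 * For example, a minimal lookup sketch using the Lucene search API
 * contemporary with this class (the directory name <code>syn_index</code>
 * is illustrative; words are stored lowercased):
 * </p>
 * <pre>
 *   IndexSearcher searcher = new IndexSearcher("syn_index");
 *   Hits hits = searcher.search(
 *       new TermQuery(new Term(Syns2Index.F_WORD, "big")));
 *   if (hits.length() &gt; 0) {
 *     String[] syns = hits.doc(0).getValues(Syns2Index.F_SYN);
 *     // each element of syns is one stored synonym of "big"
 *   }
 *   searcher.close();
 * </pre>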
 *
 * On a "fast" system this can take around 4 minutes to run, and the
 * resulting index occupies almost 3 MB.
 *
 * @author Dave Spencer, dave@searchmorph.com
 * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
 * @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
 * @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
 */
public class Syns2Index {
    /** Shorthand for standard output. */
    private static final PrintStream o = System.out;

    /** Shorthand for standard error. */
    private static final PrintStream err = System.err;

    /** Name of the field holding each synonym of a word. */
    public static final String F_SYN = "syn";

    /** Name of the field holding the (untokenized, lowercased) source word. */
    public static final String F_WORD = "word";

    /** Analyzer used when building the index. */
    private static final Analyzer ana = new StandardAnalyzer();

    /**
     * Takes the name of the prolog file (<code>wn_s.pl</code>) and the
     * directory in which to build the index.
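     * For example (the index directory name is illustrative):
     * <pre>
     *   java org.apache.lucene.wordnet.Syns2Index wn_s.pl ./syn_index
     * </pre>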
     */
    public static void main(String[] args) throws Throwable {
        // get command line arguments
        String prologFilename = null; // name of file "wn_s.pl"
        String indexDir = null;
        if (args.length == 2) {
            prologFilename = args[0];
            indexDir = args[1];
        } else {
            usage();
            System.exit(1);
        }

        // ensure that the prolog file is readable
        if (!(new File(prologFilename)).canRead()) {
            err.println("Error: cannot read Prolog file: " + prologFilename);
            System.exit(1);
        }
        // exit if the target index directory already exists
        if ((new File(indexDir)).isDirectory()) {
            err.println("Error: index directory already exists: " + indexDir);
            err.println("Please specify the name of a non-existent directory");
            System.exit(1);
        }

        o.println("Opening Prolog file " + prologFilename);
        final FileInputStream fis = new FileInputStream(prologFilename);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
        String line;

        // maps a word to all the "groups" (synset ids) it's in
        final Map word2Nums = new TreeMap();
        // maps a group (synset id) to all the words in it
        final Map num2Words = new TreeMap();
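        // e.g. word2Nums: "big" -> ["100", "101"]; num2Words: "100" -> ["big", "large"]
        // (the group numbers and words above are illustrative only)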
        // number of rejected words
        int ndecent = 0;

        // status output
        int mod = 1;
        int row = 1;
        // parse prolog file
        o.println("[1/2] Parsing " + prologFilename);
        while ((line = br.readLine()) != null) {
            // occasional progress
            if ((++row) % mod == 0) // periodically print out line we read in
            {
                mod *= 2;
                o.println("\t" + row + " " + line + " "
                    + word2Nums.size() + " " + num2Words.size()
                    + " ndecent=" + ndecent);
            }

            // syntax check
            if (!line.startsWith("s(")) {
                err.println("OUCH: " + line);
                System.exit(1);
            }

            // parse line
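            // each line has the form (see the prologdb man page):
            //   s(synset_id,w_num,'word',ss_type,sense_number,tag_count).
            // e.g. s(100002452,1,'entity',n,1,11). would yield
            // num = "100002452" and word = "entity" (ids here are illustrative)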
            line = line.substring(2);
            int comma = line.indexOf(',');
            String num = line.substring(0, comma);
            int q1 = line.indexOf('\'');
            line = line.substring(q1 + 1);
            int q2 = line.indexOf('\'');
            String word = line.substring(0, q2).toLowerCase();

            // make sure it's a normal word
            if (!isDecent(word)) {
                ndecent++;
                continue; // skip words with spaces or other non-letters
            }

            // 1/2: word2Nums map
            // append to an existing entry or add a new one
            List lis = (List) word2Nums.get(word);
            if (lis == null) {
                lis = new LinkedList();
                lis.add(num);
                word2Nums.put(word, lis);
            } else
                lis.add(num);

            // 2/2: num2Words map
            lis = (List) num2Words.get(num);
            if (lis == null) {
                lis = new LinkedList();
                lis.add(word);
                num2Words.put(num, lis);
            } else
                lis.add(word);
        }

        // close the reader (this also closes the underlying stream)
        br.close();

        // create the index
        o.println("[2/2] Building index to store synonyms,"
            + " map sizes are " + word2Nums.size() + " and "
            + num2Words.size());
        index(indexDir, word2Nums, num2Words);
    }

    /**
     * Checks whether a word consists only of alphabetic characters,
     * examining it one character at a time.
     *
     * @param s string to check
     * @return <code>true</code> if the string is decent
     */
    private static boolean isDecent(String s) {
        int len = s.length();
        for (int i = 0; i < len; i++) {
            if (!Character.isLetter(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds a Lucene index from the two maps.
     *
     * @param indexDir the directory where the index should be created
     * @param word2Nums map of word to synset ids
     * @param num2Words map of synset id to words
     */
    private static void index(String indexDir, Map word2Nums, Map num2Words)
            throws Throwable {
        int row = 0;
        int mod = 1;

        // overwrite the index if it already exists
        IndexWriter writer = new IndexWriter(indexDir, ana, true);
        writer.setUseCompoundFile(true); // fewer files on disk for this small index
        // raise these parameters for indexing speed
        writer.setMergeFactor(writer.getMergeFactor() * 2);
        writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 2);
        Iterator i1 = word2Nums.keySet().iterator();
        while (i1.hasNext()) // for each word
        {
            String g = (String) i1.next();
            Document doc = new Document();

            int n = index(word2Nums, num2Words, g, doc);
            if (n > 0) {
                doc.add(new Field(F_WORD, g, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
                if ((++row % mod) == 0) {
                    o.println("\trow=" + row + "/" + word2Nums.size()
                        + " doc= " + doc);
                    mod *= 2;
                }
                writer.addDocument(doc);
            } // else degenerate: the word has no decent synonyms, so skip it
        }
        o.println("Optimizing..");
        writer.optimize();
        writer.close();
    }

    /**
     * Given the two maps, fills a document with the synonyms for one word.
     */
    private static int index(Map word2Nums, Map num2Words, String g,
            Document doc) throws Throwable {
        List keys = (List) word2Nums.get(g); // get list of key#'s
        Iterator i2 = keys.iterator();

        Set already = new TreeSet(); // keep them sorted

        // pass 1: fill up 'already' with all words
        while (i2.hasNext()) // for each key#
        {
            already.addAll((List) num2Words.get(i2.next())); // get list of words
        }
        int num = 0;
        already.remove(g); // of course a word is its own synonym; don't store it
        Iterator it = already.iterator();
        while (it.hasNext()) {
            String cur = (String) it.next();
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!isDecent(cur)) {
                continue;
            }
            num++;
            doc.add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
        }
        return num;
    }

    /** Prints the command-line usage of this tool. */
    private static void usage() {
        o.println("\n\n"
            + "java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
    }

}