package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Converts the prolog file <code>wn_s.pl</code> from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
 * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
 *
 * This has been tested with WordNet 2.0.
 *
 * The index has fields named "word" ({@link #F_WORD})
 * and "syn" ({@link #F_SYN}).
 * <p>
 * The source word (such as 'big') can be looked up in the
 * "word" field, and if present there will be fields named "syn"
 * for every synonym. Note that in the general case a word has multiple
 * synonyms, so the document contains <b>multiple</b> fields with the same
 * name. That is not a problem for Lucene; retrieve them all with
 * {@link org.apache.lucene.document.Document#getValues}.
 * </p>
 * <p>
 * While the WordNet file distinguishes groups (synsets) of synonyms with
 * related meanings, this tool does not preserve that distinction.
 * </p>
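 *
 * <p>
 * For example, a minimal lookup sketch using the Lucene search API
 * contemporary with this class (the directory name <code>syn_index</code>
 * is illustrative; words are stored lowercased):
 * </p>
 * <pre>
 *   IndexSearcher searcher = new IndexSearcher("syn_index");
 *   Hits hits = searcher.search(
 *       new TermQuery(new Term(Syns2Index.F_WORD, "big")));
 *   if (hits.length() &gt; 0) {
 *     String[] syns = hits.doc(0).getValues(Syns2Index.F_SYN);
 *     // each element of syns is one stored synonym of "big"
 *   }
 *   searcher.close();
 * </pre>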
 *
 * On a "fast" system this can take around 4 minutes to run, and the
 * resulting index occupies almost 3 MB.
 *
 * @author Dave Spencer, dave@searchmorph.com
 * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
 * @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
 * @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
 */
public class Syns2Index {
    /** Shorthand for standard output. */
    private static final PrintStream o = System.out;

    /** Shorthand for standard error. */
    private static final PrintStream err = System.err;

    /** Name of the field holding each synonym of a word. */
    public static final String F_SYN = "syn";

    /** Name of the field holding the (untokenized, lowercased) source word. */
    public static final String F_WORD = "word";

    /** Analyzer used when building the index. */
    private static final Analyzer ana = new StandardAnalyzer();

    /**
     * Takes the name of the prolog file (<code>wn_s.pl</code>) and the
     * directory in which to build the index.
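     * For example (the index directory name is illustrative):
     * <pre>
     *   java org.apache.lucene.wordnet.Syns2Index wn_s.pl ./syn_index
     * </pre>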
     */
    public static void main(String[] args) throws Throwable {
        // get command line arguments
        String prologFilename = null; // name of file "wn_s.pl"
        String indexDir = null;
        if (args.length == 2) {
            prologFilename = args[0];
            indexDir = args[1];
        } else {
            usage();
            System.exit(1);
        }

        // ensure that the prolog file is readable
        if (!(new File(prologFilename)).canRead()) {
            err.println("Error: cannot read Prolog file: " + prologFilename);
            System.exit(1);
        }
        // exit if the target index directory already exists
        if ((new File(indexDir)).isDirectory()) {
            err.println("Error: index directory already exists: " + indexDir);
            err.println("Please specify the name of a non-existent directory");
            System.exit(1);
        }

        o.println("Opening Prolog file " + prologFilename);
        final FileInputStream fis = new FileInputStream(prologFilename);
        final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
        String line;

        // maps a word to all the "groups" (synset ids) it's in
        final Map word2Nums = new TreeMap();
        // maps a group (synset id) to all the words in it
        final Map num2Words = new TreeMap();
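        // e.g. word2Nums: "big" -> ["100", "101"]; num2Words: "100" -> ["big", "large"]
        // (the group numbers and words above are illustrative only)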
        // number of rejected words
        int ndecent = 0;

        // status output
        int mod = 1;
        int row = 1;
        // parse prolog file
        o.println("[1/2] Parsing " + prologFilename);
        while ((line = br.readLine()) != null) {
            // occasional progress
            if ((++row) % mod == 0) // periodically print out line we read in
            {
                mod *= 2;
                o.println("\t" + row + " " + line + " "
                    + word2Nums.size() + " " + num2Words.size()
                    + " ndecent=" + ndecent);
            }

            // syntax check
            if (!line.startsWith("s(")) {
                err.println("OUCH: " + line);
                System.exit(1);
            }

            // parse line
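            // each line has the form (see the prologdb man page):
            //   s(synset_id,w_num,'word',ss_type,sense_number,tag_count).
            // e.g. s(100002452,1,'entity',n,1,11). would yield
            // num = "100002452" and word = "entity" (ids here are illustrative)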
            line = line.substring(2);
            int comma = line.indexOf(',');
            String num = line.substring(0, comma);
            int q1 = line.indexOf('\'');
            line = line.substring(q1 + 1);
            int q2 = line.indexOf('\'');
            String word = line.substring(0, q2).toLowerCase();

            // make sure it's a normal word
            if (!isDecent(word)) {
                ndecent++;
                continue; // skip words with spaces or other non-letters
            }

            // 1/2: word2Nums map
            // append to an existing entry or add a new one
            List lis = (List) word2Nums.get(word);
            if (lis == null) {
                lis = new LinkedList();
                lis.add(num);
                word2Nums.put(word, lis);
            } else
                lis.add(num);

            // 2/2: num2Words map
            lis = (List) num2Words.get(num);
            if (lis == null) {
                lis = new LinkedList();
                lis.add(word);
                num2Words.put(num, lis);
            } else
                lis.add(word);
        }

        // close the reader (this also closes the underlying stream)
        br.close();

        // create the index
        o.println("[2/2] Building index to store synonyms,"
            + " map sizes are " + word2Nums.size() + " and "
            + num2Words.size());
        index(indexDir, word2Nums, num2Words);
    }

    /**
     * Checks whether a word consists only of alphabetic characters,
     * examining it one character at a time.
     *
     * @param s string to check
     * @return <code>true</code> if the string is decent
     */
    private static boolean isDecent(String s) {
        int len = s.length();
        for (int i = 0; i < len; i++) {
            if (!Character.isLetter(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds a Lucene index from the two maps.
     *
     * @param indexDir the directory where the index should be created
     * @param word2Nums map of word to synset ids
     * @param num2Words map of synset id to words
     */
    private static void index(String indexDir, Map word2Nums, Map num2Words)
            throws Throwable {
        int row = 0;
        int mod = 1;

        // overwrite the index if it already exists
        IndexWriter writer = new IndexWriter(indexDir, ana, true);
        writer.setUseCompoundFile(true); // fewer files on disk for this small index
        // raise these parameters for indexing speed
        writer.setMergeFactor(writer.getMergeFactor() * 2);
        writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 2);
        Iterator i1 = word2Nums.keySet().iterator();
        while (i1.hasNext()) // for each word
        {
            String g = (String) i1.next();
            Document doc = new Document();

            int n = index(word2Nums, num2Words, g, doc);
            if (n > 0) {
                doc.add(new Field(F_WORD, g, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
                if ((++row % mod) == 0) {
                    o.println("\trow=" + row + "/" + word2Nums.size()
                        + " doc= " + doc);
                    mod *= 2;
                }
                writer.addDocument(doc);
            } // else degenerate: the word has no decent synonyms, so skip it
        }
        o.println("Optimizing..");
        writer.optimize();
        writer.close();
    }

    /**
     * Given the two maps, fills a document with the synonyms for one word.
     */
    private static int index(Map word2Nums, Map num2Words, String g,
            Document doc) throws Throwable {
        List keys = (List) word2Nums.get(g); // get list of key#'s
        Iterator i2 = keys.iterator();

        Set already = new TreeSet(); // keep them sorted

        // pass 1: fill up 'already' with all words
        while (i2.hasNext()) // for each key#
        {
            already.addAll((List) num2Words.get(i2.next())); // get list of words
        }
        int num = 0;
        already.remove(g); // of course a word is its own synonym; don't store it
        Iterator it = already.iterator();
        while (it.hasNext()) {
            String cur = (String) it.next();
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!isDecent(cur)) {
                continue;
            }
            num++;
            doc.add(new Field(F_SYN, cur, Field.Store.YES, Field.Index.NO));
        }
        return num;
    }

    /** Prints the command-line usage of this tool. */
    private static void usage() {
        o.println("\n\n"
            + "java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
    }

}