001: package org.apache.lucene.index.memory;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.io.InputStream;
022: import java.nio.ByteBuffer;
023: import java.nio.charset.Charset;
024: import java.util.ArrayList;
025: import java.util.Arrays;
026: import java.util.HashMap;
027: import java.util.Iterator;
028: import java.util.Map;
029: import java.util.TreeMap;
030: import java.util.TreeSet;
031:
032: /**
033: * Loads the <a target="_blank"
034: * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
035: * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
036: * into a thread-safe main-memory hash map that can be used for fast
037: * high-frequency lookups of synonyms for any given (lowercase) word string.
038: * <p>
039: * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
040: * There does not necessarily hold: A -> B, B -> C then A -> C.
041: * <p>
042: * Loading typically takes some 1.5 secs, so should be done only once per
043: * (server) program execution, using a singleton pattern. Once loaded, a
044: * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
045: * A loaded default synonym map consumes about 10 MB main memory.
046: * An instance is immutable, hence thread-safe.
047: * <p>
048: * This implementation borrows some ideas from the Lucene Syns2Index demo that
049: * Dave Spencer originally contributed to Lucene. Dave's approach
050: * involved a persistent Lucene index which is suitable for occasional
051: * lookups or very large synonym tables, but considered unsuitable for
052: * high-frequency lookups of medium size synonym tables.
053: * <p>
054: * Example Usage:
055: * <pre>
056: * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
057: * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
058: * for (int i = 0; i < words.length; i++) {
059: * String[] synonyms = map.getSynonyms(words[i]);
060: * System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
061: * }
062: *
063: * Example output:
064: * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
065: * woods:[forest, wood]
066: * forest:[afforest, timber, timberland, wood, woodland, woods]
067: * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
068: * xxxx:[]
069: * </pre>
070: *
071: * @author whoschek.AT.lbl.DOT.gov
072: * @see <a target="_blank"
073: * href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
074: * man page </a>
075: * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
076: */
077: public class SynonymMap {
078:
079: /** the index data; Map<String word, String[] synonyms> */
080: private final HashMap table;
081:
082: private static final String[] EMPTY = new String[0];
083:
084: private static final boolean DEBUG = false;
085:
086: /**
087: * Constructs an instance, loading WordNet synonym data from the given input
088: * stream. Finally closes the stream. The words in the stream must be in
089: * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
090: *
091: * @param input
092: * the stream to read from (null indicates an empty synonym map)
093: * @throws IOException
094: * if an error occured while reading the stream.
095: */
096: public SynonymMap(InputStream input) throws IOException {
097: this .table = input == null ? new HashMap(0)
098: : read(toByteArray(input));
099: }
100:
101: /**
102: * Returns the synonym set for the given word, sorted ascending.
103: *
104: * @param word
105: * the word to lookup (must be in lowercase).
106: * @return the synonyms; a set of zero or more words, sorted ascending, each
107: * word containing lowercase characters that satisfy
108: * <code>Character.isLetter()</code>.
109: */
110: public String[] getSynonyms(String word) {
111: Object syns = table.get(word);
112: if (syns == null)
113: return EMPTY;
114: if (syns instanceof String)
115: return new String[] { (String) syns };
116:
117: String[] synonyms = (String[]) syns;
118: String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
119: System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
120: return copy;
121: }
122:
123: /**
124: * Returns a String representation of the index data for debugging purposes.
125: *
126: * @return a String representation
127: */
128: public String toString() {
129: StringBuffer buf = new StringBuffer();
130: Iterator iter = new TreeMap(table).keySet().iterator();
131: int count = 0;
132: int f0 = 0;
133: int f1 = 0;
134: int f2 = 0;
135: int f3 = 0;
136:
137: while (iter.hasNext()) {
138: String word = (String) iter.next();
139: buf.append(word + ":");
140: String[] synonyms = getSynonyms(word);
141: buf.append(Arrays.asList(synonyms));
142: buf.append("\n");
143: count += synonyms.length;
144: if (synonyms.length == 0)
145: f0++;
146: if (synonyms.length == 1)
147: f1++;
148: if (synonyms.length == 2)
149: f2++;
150: if (synonyms.length == 3)
151: f3++;
152: }
153:
154: buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count
155: + ", f0=" + f0 + ", f1=" + f1 + ", f2=" + f2 + ", f3="
156: + f3);
157: return buf.toString();
158: }
159:
160: /**
161: * Analyzes/transforms the given word on input stream loading. This default implementation simply
162: * lowercases the word. Override this method with a custom stemming
163: * algorithm or similar, if desired.
164: *
165: * @param word
166: * the word to analyze
167: * @return the same word, or a different word (or null to indicate that the
168: * word should be ignored)
169: */
170: protected String analyze(String word) {
171: return word.toLowerCase();
172: }
173:
174: private static boolean isValid(String str) {
175: for (int i = str.length(); --i >= 0;) {
176: if (!Character.isLetter(str.charAt(i)))
177: return false;
178: }
179: return true;
180: }
181:
182: private HashMap read(byte[] data) {
183: int WORDS = (int) (76401 / 0.7); // presizing
184: int GROUPS = (int) (88022 / 0.7); // presizing
185: HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
186: HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
187: HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
188:
189: Charset charset = Charset.forName("UTF-8");
190: int lastNum = -1;
191: Integer lastGroup = null;
192: int len = data.length;
193: int i = 0;
194:
195: while (i < len) { // until EOF
196: /* Part A: Parse a line */
197:
198: // scan to beginning of group
199: while (i < len && data[i] != '(')
200: i++;
201: if (i >= len)
202: break; // EOF
203: i++;
204:
205: // parse group
206: int num = 0;
207: while (i < len && data[i] != ',') {
208: num = 10 * num + (data[i] - 48);
209: i++;
210: }
211: i++;
212: // if (DEBUG) System.err.println("num="+ num);
213:
214: // scan to beginning of word
215: while (i < len && data[i] != '\'')
216: i++;
217: i++;
218:
219: // scan to end of word
220: int start = i;
221: do {
222: while (i < len && data[i] != '\'')
223: i++;
224: i++;
225: } while (i < len && data[i] != ','); // word must end with "',"
226:
227: if (i >= len)
228: break; // EOF
229: String word = charset.decode(
230: ByteBuffer.wrap(data, start, i - start - 1))
231: .toString();
232: // String word = new String(data, 0, start, i-start-1); // ASCII
233:
234: /*
235: * Part B: ignore phrases (with spaces and hyphens) and
236: * non-alphabetic words, and let user customize word (e.g. do some
237: * stemming)
238: */
239: if (!isValid(word))
240: continue; // ignore
241: word = analyze(word);
242: if (word == null || word.length() == 0)
243: continue; // ignore
244:
245: /* Part C: Add (group,word) to tables */
246:
247: // ensure compact string representation, minimizing memory overhead
248: String w = (String) internedWords.get(word);
249: if (w == null) {
250: word = new String(word); // ensure compact string
251: internedWords.put(word, word);
252: } else {
253: word = w;
254: }
255:
256: Integer group = lastGroup;
257: if (num != lastNum) {
258: group = new Integer(num);
259: lastGroup = group;
260: lastNum = num;
261: }
262:
263: // add word --> group
264: ArrayList groups = (ArrayList) word2Groups.get(word);
265: if (groups == null) {
266: groups = new ArrayList(1);
267: word2Groups.put(word, groups);
268: }
269: groups.add(group);
270:
271: // add group --> word
272: ArrayList words = (ArrayList) group2Words.get(group);
273: if (words == null) {
274: words = new ArrayList(1);
275: group2Words.put(group, words);
276: }
277: words.add(word);
278: }
279:
280: /* Part D: compute index data structure */
281: HashMap word2Syns = createIndex(word2Groups, group2Words);
282:
283: /* Part E: minimize memory consumption by a factor 3 (or so) */
284: // if (true) return word2Syns;
285: word2Groups = null; // help gc
286: group2Words = null; // help gc
287: return optimize(word2Syns, internedWords);
288: }
289:
290: private HashMap createIndex(Map word2Groups, Map group2Words) {
291: HashMap word2Syns = new HashMap();
292: Iterator iter = word2Groups.entrySet().iterator();
293:
294: while (iter.hasNext()) { // for each word
295: Map.Entry entry = (Map.Entry) iter.next();
296: ArrayList group = (ArrayList) entry.getValue();
297: String word = (String) entry.getKey();
298:
299: // HashSet synonyms = new HashSet();
300: TreeSet synonyms = new TreeSet();
301: for (int i = group.size(); --i >= 0;) { // for each groupID of word
302: ArrayList words = (ArrayList) group2Words.get(group
303: .get(i));
304: for (int j = words.size(); --j >= 0;) { // add all words
305: Object synonym = words.get(j); // note that w and word are interned
306: if (synonym != word) { // a word is implicitly it's own synonym
307: synonyms.add(synonym);
308: }
309: }
310: }
311:
312: int size = synonyms.size();
313: if (size > 0) {
314: String[] syns = new String[size];
315: if (size == 1)
316: syns[0] = (String) synonyms.first();
317: else
318: synonyms.toArray(syns);
319: // if (syns.length > 1) Arrays.sort(syns);
320: // if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
321: word2Syns.put(word, syns);
322: }
323: }
324:
325: return word2Syns;
326: }
327:
328: private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
329: if (DEBUG) {
330: System.err.println("before gc");
331: for (int i = 0; i < 10; i++)
332: System.gc();
333: System.err.println("after gc");
334: }
335:
336: // collect entries
337: int len = 0;
338: int size = word2Syns.size();
339: String[][] allSynonyms = new String[size][];
340: String[] words = new String[size];
341: Iterator iter = word2Syns.entrySet().iterator();
342: for (int j = 0; j < size; j++) {
343: Map.Entry entry = (Map.Entry) iter.next();
344: allSynonyms[j] = (String[]) entry.getValue();
345: words[j] = (String) entry.getKey();
346: len += words[j].length();
347: }
348:
349: // assemble large string containing all words
350: StringBuffer buf = new StringBuffer(len);
351: for (int j = 0; j < size; j++)
352: buf.append(words[j]);
353: String allWords = new String(buf.toString()); // ensure compact string across JDK versions
354: buf = null;
355:
356: // intern words at app level via memory-overlaid substrings
357: for (int p = 0, j = 0; j < size; j++) {
358: String word = words[j];
359: internedWords.put(word, allWords.substring(p, p
360: + word.length()));
361: p += word.length();
362: }
363:
364: // replace words with interned words
365: for (int j = 0; j < size; j++) {
366: String[] syns = allSynonyms[j];
367: for (int k = syns.length; --k >= 0;) {
368: syns[k] = (String) internedWords.get(syns[k]);
369: }
370: Object replacement = syns;
371: if (syns.length == 1)
372: replacement = syns[0]; // minimize memory consumption some more
373: word2Syns.remove(words[j]);
374: word2Syns.put(internedWords.get(words[j]), replacement);
375: }
376:
377: if (DEBUG) {
378: words = null;
379: allSynonyms = null;
380: internedWords = null;
381: allWords = null;
382: System.err.println("before gc");
383: for (int i = 0; i < 10; i++)
384: System.gc();
385: System.err.println("after gc");
386: }
387: return word2Syns;
388: }
389:
390: // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
391: private static byte[] toByteArray(InputStream input)
392: throws IOException {
393: try {
394: // safe and fast even if input.available() behaves weird or buggy
395: int len = Math.max(256, input.available());
396: byte[] buffer = new byte[len];
397: byte[] output = new byte[len];
398:
399: len = 0;
400: int n;
401: while ((n = input.read(buffer)) >= 0) {
402: if (len + n > output.length) { // grow capacity
403: byte tmp[] = new byte[Math.max(output.length << 1,
404: len + n)];
405: System.arraycopy(output, 0, tmp, 0, len);
406: System.arraycopy(buffer, 0, tmp, len, n);
407: buffer = output; // use larger buffer for future larger bulk reads
408: output = tmp;
409: } else {
410: System.arraycopy(buffer, 0, output, len, n);
411: }
412: len += n;
413: }
414:
415: if (len == output.length)
416: return output;
417: buffer = null; // help gc
418: buffer = new byte[len];
419: System.arraycopy(output, 0, buffer, 0, len);
420: return buffer;
421: } finally {
422: if (input != null)
423: input.close();
424: }
425: }
426:
427: }
|