package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Various fulltext analysis utilities avoiding redundant code in several
 * classes.
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class AnalyzerUtil {

  private AnalyzerUtil() {}

  /**
   * Returns a simple analyzer wrapper that logs all tokens produced by the
   * underlying child analyzer to the given log stream (typically System.err);
   * otherwise it behaves exactly like the child analyzer, delivering the very
   * same tokens. Useful for debugging custom indexing and/or querying code.
   *
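   * For example, a minimal usage sketch (SimpleAnalyzer merely stands in for
   * any child analyzer here):
   * <pre>
   * Analyzer logging = AnalyzerUtil.getLoggingAnalyzer(
   *     new SimpleAnalyzer(), System.err, "log");
   * // each token is printed to System.err as this stream is consumed:
   * TokenStream stream = logging.tokenStream("content",
   *     new StringReader("The quick brown fox"));
   * </pre>
   *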
   * @param child
   *            the underlying child analyzer
   * @param log
   *            the print stream to log to (typically System.err)
   * @param logName
   *            a name for this logger (typically "log" or similar)
   * @return a logging analyzer
   */
  public static Analyzer getLoggingAnalyzer(final Analyzer child,
      final PrintStream log, final String logName) {

    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (log == null)
      throw new IllegalArgumentException("logStream must not be null");

    return new Analyzer() {
      public TokenStream tokenStream(final String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int position = -1;

          public Token next() throws IOException {
            Token token = input.next(); // from filter super class
            log.println(toString(token));
            return token;
          }

          private String toString(Token token) {
            if (token == null)
              return "[" + logName + ":EOS:" + fieldName + "]\n";

            position += token.getPositionIncrement();
            return "[" + logName + ":" + position + ":" + fieldName + ":"
                + token.termText() + ":" + token.startOffset() + "-"
                + token.endOffset() + ":" + token.type() + "]";
          }
        };
      }
    };
  }

  /**
   * Returns an analyzer wrapper that returns at most the first
   * <code>maxTokens</code> tokens from the underlying child analyzer,
   * ignoring all remaining tokens.
   *
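   * For example, a minimal usage sketch that caps each field at its first 100
   * tokens (SimpleAnalyzer merely stands in for any child analyzer):
   * <pre>
   * Analyzer analyzer = AnalyzerUtil.getMaxTokenAnalyzer(new SimpleAnalyzer(), 100);
   * </pre>
   *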
   * @param child
   *            the underlying child analyzer
   * @param maxTokens
   *            the maximum number of tokens to return from the underlying
   *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)
   * @return an analyzer wrapper
   */
  public static Analyzer getMaxTokenAnalyzer(final Analyzer child,
      final int maxTokens) {

    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (maxTokens < 0)
      throw new IllegalArgumentException("maxTokens must not be negative");
    if (maxTokens == Integer.MAX_VALUE)
      return child; // no need to wrap

    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int todo = maxTokens;

          public Token next() throws IOException {
            return --todo >= 0 ? input.next() : null;
          }
        };
      }
    };
  }

  /**
   * Returns an English stemming analyzer that stems tokens from the
   * underlying child analyzer according to the Porter stemming algorithm. The
   * child analyzer must deliver tokens in lower case for the stemmer to work
   * properly.
   * <p>
   * Background: Stemming reduces token terms to their linguistic root form,
   * e.g. it reduces "fishing" and "fishes" to "fish", "family" and "families"
   * to "famili", as well as "complete" and "completion" to "complet". Note
   * that the root form is not necessarily a meaningful word in itself; this
   * is not a bug but a feature, because a common root form is exactly what
   * fuzzy word matching needs.
   * <p>
   * See the Lucene contrib packages for stemmers (and stop words) for German,
   * Russian and many more languages.
   *
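   * For example, a minimal usage sketch (SimpleAnalyzer already lowercases its
   * tokens, as the stemmer requires):
   * <pre>
   * Analyzer stemming = AnalyzerUtil.getPorterStemmerAnalyzer(new SimpleAnalyzer());
   * </pre>
   *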
   * @param child
   *            the underlying child analyzer
   * @return an analyzer wrapper
   */
  public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {

    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");

    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new PorterStemFilter(child.tokenStream(fieldName, reader));
//        /* PorterStemFilter and SnowballFilter have the same behaviour,
//           but PorterStemFilter is much faster. */
//        return new org.apache.lucene.analysis.snowball.SnowballFilter(
//            child.tokenStream(fieldName, reader), "English");
      }
    };
  }

  /**
   * Returns an analyzer wrapper that wraps the underlying child analyzer's
   * token stream into a {@link SynonymTokenFilter}.
   *
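   * For example, a minimal usage sketch; the path to the WordNet prolog
   * synonym file is only a placeholder, and SimpleAnalyzer merely stands in
   * for any child analyzer:
   * <pre>
   * SynonymMap synonyms = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
   * Analyzer analyzer = AnalyzerUtil.getSynonymAnalyzer(new SimpleAnalyzer(), synonyms, 2);
   * </pre>
   *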
   * @param child
   *            the underlying child analyzer
   * @param synonyms
   *            the map used to extract synonyms for terms
   * @param maxSynonyms
   *            the maximum number of synonym tokens to return per underlying
   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
   * @return a new analyzer
   */
  public static Analyzer getSynonymAnalyzer(final Analyzer child,
      final SynonymMap synonyms, final int maxSynonyms) {

    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");
    if (maxSynonyms == 0)
      return child; // no need to wrap

    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new SynonymTokenFilter(
            child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
      }
    };
  }

  /**
   * Returns an analyzer wrapper that caches all tokens generated by the
   * underlying child analyzer's token streams, and delivers those cached
   * tokens on subsequent calls to
   * <code>tokenStream(String fieldName, Reader reader)</code> if the
   * fieldName has been seen before, altogether ignoring the Reader parameter
   * on cache lookup.
   * <p>
   * If Analyzer / TokenFilter chains are expensive in terms of I/O or CPU,
   * such caching can help improve performance if the same document is added
   * to multiple Lucene indexes, because the text analysis phase need not be
   * performed more than once.
   * <p>
   * Caveats:
   * <ul>
   * <li>Caching the tokens of large Lucene documents can lead to an
   * OutOfMemoryError.</li>
   * <li>The Token instances delivered by the underlying child analyzer must
   * be immutable.</li>
   * <li>The same caching analyzer instance must not be used for more than one
   * document because the cache is not keyed on the Reader parameter.</li>
   * </ul>
   *
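   * For example, a minimal usage sketch; the analysis chain runs only for the
   * first stream per field name, and once that first stream has been fully
   * consumed (e.g. by an IndexWriter adding the document), later streams for
   * the same field name replay the cached tokens (SimpleAnalyzer merely
   * stands in for any child analyzer):
   * <pre>
   * Analyzer caching = AnalyzerUtil.getTokenCachingAnalyzer(new SimpleAnalyzer());
   * TokenStream stream = caching.tokenStream("content",
   *     new StringReader("some document text"));
   * </pre>
   *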
   * @param child
   *            the underlying child analyzer
   * @return a new analyzer
   */
  public static Analyzer getTokenCachingAnalyzer(final Analyzer child) {

    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");

    return new Analyzer() {

      private final HashMap cache = new HashMap();

      public TokenStream tokenStream(String fieldName, Reader reader) {
        final ArrayList tokens = (ArrayList) cache.get(fieldName);
        if (tokens == null) { // not yet cached
          final ArrayList tokens2 = new ArrayList();
          TokenStream tokenStream = new TokenFilter(
              child.tokenStream(fieldName, reader)) {

            public Token next() throws IOException {
              Token token = input.next(); // from filter super class
              if (token != null)
                tokens2.add(token);
              return token;
            }
          };

          cache.put(fieldName, tokens2);
          return tokenStream;
        } else { // already cached
          return new TokenStream() {

            private Iterator iter = tokens.iterator();

            public Token next() {
              if (!iter.hasNext())
                return null;
              return (Token) iter.next();
            }
          };
        }
      }
    };
  }

  /**
   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
   * sorted descending by frequency (and ascending by term, if tied).
   * <p>
   * Example XQuery:
   * <pre>
   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
   *
   * for $pair in util:get-most-frequent-terms(
   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
   * </pre>
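   * <p>
   * The same call from Java, as a minimal sketch (SimpleAnalyzer merely
   * stands in for any analyzer):
   * <pre>
   * String[] pairs = AnalyzerUtil.getMostFrequentTerms(
   *     new SimpleAnalyzer(), "the quick fox jumps over the lazy dog dog", 3);
   * // yields: ["2:dog", "2:the", "1:fox"]
   * </pre>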
   *
   * @param analyzer
   *            the analyzer to use for splitting text into terms (aka words)
   * @param text
   *            the text to analyze
   * @param limit
   *            the maximum number of pairs to return; zero or a negative
   *            value indicates "as many as possible".
   * @return an array of (frequency:term) pairs in the form of (freq0:term0,
   *         freq1:term1, ..., freqN:termN). Each pair is a single string
   *         separated by a ':' delimiter.
   */
  public static String[] getMostFrequentTerms(Analyzer analyzer, String text,
      int limit) {
    if (analyzer == null)
      throw new IllegalArgumentException("analyzer must not be null");
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    if (limit <= 0)
      limit = Integer.MAX_VALUE;

    // compute frequencies of distinct terms
    HashMap map = new HashMap();
    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
    try {
      Token token;
      while ((token = stream.next()) != null) {
        MutableInteger freq = (MutableInteger) map.get(token.termText());
        if (freq == null) {
          freq = new MutableInteger(1);
          map.put(token.termText(), freq);
        } else {
          freq.setValue(freq.intValue() + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        stream.close();
      } catch (IOException e2) {
        throw new RuntimeException(e2);
      }
    }

    // sort by frequency, text
    Map.Entry[] entries = new Map.Entry[map.size()];
    map.entrySet().toArray(entries);
    Arrays.sort(entries, new Comparator() {
      public int compare(Object o1, Object o2) {
        Map.Entry e1 = (Map.Entry) o1;
        Map.Entry e2 = (Map.Entry) o2;
        int f1 = ((MutableInteger) e1.getValue()).intValue();
        int f2 = ((MutableInteger) e2.getValue()).intValue();
        if (f2 - f1 != 0)
          return f2 - f1;
        String s1 = (String) e1.getKey();
        String s2 = (String) e2.getKey();
        return s1.compareTo(s2);
      }
    });

    // return top N entries
    int size = Math.min(limit, entries.length);
    String[] pairs = new String[size];
    for (int i = 0; i < size; i++) {
      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
    }
    return pairs;
  }

  private static final class MutableInteger {
    private int value;

    public MutableInteger(int value) {
      this.value = value;
    }

    public int intValue() {
      return value;
    }

    public void setValue(int value) {
      this.value = value;
    }

    public String toString() {
      return String.valueOf(value);
    }
  }

  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
  /** (A line terminator followed by zero or more non-line-break whitespace characters), two or more times */
  private static final Pattern PARAGRAPHS = Pattern
      .compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");

  /**
   * Returns at most the first N paragraphs of the given text. Delimiting
   * characters are excluded from the results. Each returned paragraph is
   * whitespace-trimmed via String.trim() and may be an empty string.
   *
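   * For example, a minimal usage sketch:
   * <pre>
   * String[] paragraphs = AnalyzerUtil.getParagraphs(
   *     "First paragraph.\n\nSecond paragraph.", 0);
   * // yields: ["First paragraph.", "Second paragraph."]
   * </pre>
   *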
   * @param text
   *            the text to tokenize into paragraphs
   * @param limit
   *            the maximum number of paragraphs to return; zero indicates "as
   *            many as possible".
   * @return the first N paragraphs
   */
  public static String[] getParagraphs(String text, int limit) {
    return tokenize(PARAGRAPHS, text, limit);
  }

  private static String[] tokenize(Pattern pattern, String text, int limit) {
    String[] tokens = pattern.split(text, limit);
    for (int i = tokens.length; --i >= 0;)
      tokens[i] = tokens[i].trim();
    return tokens;
  }

  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
  /** Matches sentence delimiters; includes the inverted Spanish exclamation and question marks */
  private static final Pattern SENTENCES = Pattern
      .compile("[!\\.\\?\\xA1\\xBF]+");

  /**
   * Returns at most the first N sentences of the given text. Delimiting
   * characters are excluded from the results. Each returned sentence is
   * whitespace-trimmed via String.trim() and may be an empty string.
   *
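   * For example, a minimal usage sketch:
   * <pre>
   * String[] sentences = AnalyzerUtil.getSentences("Hello world! How are you? Fine.", 2);
   * // yields: ["Hello world", "How are you"]
   * </pre>
   *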
   * @param text
   *            the text to tokenize into sentences
   * @param limit
   *            the maximum number of sentences to return; zero or a negative
   *            value indicates "as many as possible".
   * @return the first N sentences
   */
  public static String[] getSentences(String text, int limit) {
//    return tokenize(SENTENCES, text, limit); // equivalent but slower
    int len = text.length();
    if (len == 0)
      return new String[] { text };
    if (limit <= 0)
      limit = Integer.MAX_VALUE;

    // average sentence length heuristic
    String[] tokens = new String[Math.min(limit, 1 + len / 40)];
    int size = 0;
    int i = 0;

    while (i < len && size < limit) {

      // scan to end of current sentence
      int start = i;
      while (i < len && !isSentenceSeparator(text.charAt(i)))
        i++;

      if (size == tokens.length) { // grow array
        String[] tmp = new String[tokens.length << 1];
        System.arraycopy(tokens, 0, tmp, 0, size);
        tokens = tmp;
      }
      // add sentence (potentially empty)
      tokens[size++] = text.substring(start, i).trim();

      // scan to beginning of next sentence
      while (i < len && isSentenceSeparator(text.charAt(i)))
        i++;
    }

    if (size == tokens.length)
      return tokens;
    String[] results = new String[size];
    System.arraycopy(tokens, 0, results, 0, size);
    return results;
  }

479:
480: private static boolean isSentenceSeparator(char c) {
481: // regex [!\\.\\?\\xA1\\xBF]
482: switch (c) {
483: case '!':
484: return true;
485: case '.':
486: return true;
487: case '?':
488: return true;
489: case 0xA1:
490: return true; // spanish inverted exclamation mark
491: case 0xBF:
492: return true; // spanish inverted question mark
493: default:
494: return false;
495: }
496: }

}