package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer} and
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * <p>
 * If you are unsure how exactly a regular expression should look, consider
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to
 * PatternAnalyzer. Also see <a target="_blank"
 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
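 * <p>
 * For instance, a candidate pattern can be sanity-checked outside Lucene
 * like this (purely illustrative):
 * <pre>
 * // "\\W+" splits at runs of non-word characters, as NON_WORD_PATTERN does
 * String[] terms = "James is running round in the woods".split("\\W+");
 * // terms == {"James", "is", "running", "round", "in", "the", "woods"}
 * </pre>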
 * <p>
 * This class can be considerably faster than the "normal" Lucene tokenizers.
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
 * stemming example:
 * <pre>
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"),
 *     "English");
 * </pre>
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzer extends Analyzer {

  /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");

  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
      "a", "about", "above", "across", "adj", "after", "afterwards",
      "again", "against", "albeit", "all", "almost", "alone", "along",
      "already", "also", "although", "always", "among", "amongst", "an",
      "and", "another", "any", "anyhow", "anyone", "anything", "anywhere",
      "are", "around", "as", "at", "be", "became", "because", "become",
      "becomes", "becoming", "been", "before", "beforehand", "behind",
      "being", "below", "beside", "besides", "between", "beyond", "both",
      "but", "by", "can", "cannot", "co", "could", "down", "during",
      "each", "eg", "either", "else", "elsewhere", "enough", "etc",
      "even", "ever", "every", "everyone", "everything", "everywhere",
      "except", "few", "first", "for", "former", "formerly", "from",
      "further", "had", "has", "have", "he", "hence", "her", "here",
      "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
      "him", "himself", "his", "how", "however", "i", "ie", "if", "in",
      "inc", "indeed", "into", "is", "it", "its", "itself", "last",
      "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
      "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
      "must", "my", "myself", "namely", "neither", "never",
      "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
      "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
      "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
      "rather", "s", "same", "seem", "seemed", "seeming", "seems",
      "several", "she", "should", "since", "so", "some", "somehow",
      "someone", "something", "sometime", "sometimes", "somewhere",
      "still", "such", "t", "than", "that", "the", "their", "them",
      "themselves", "then", "thence", "there", "thereafter", "thereby",
      "therefor", "therein", "thereupon", "these", "they", "this",
      "those", "though", "through", "throughout", "thru", "thus", "to",
      "together", "too", "toward", "towards", "under", "until", "up",
      "upon", "us", "very", "via", "was", "we", "well", "were", "what",
      "whatever", "whatsoever", "when", "whence", "whenever",
      "whensoever", "where", "whereafter", "whereas", "whereat",
      "whereby", "wherefrom", "wherein", "whereinto", "whereof",
      "whereon", "whereto", "whereunto", "whereupon", "wherever",
      "wherewith", "whether", "which", "whichever", "whichsoever",
      "while", "whilst", "whither", "who", "whoever", "whole", "whom",
      "whomever", "whomsoever", "whose", "whosoever", "why", "will",
      "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
      "yourselves" });

  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
      NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

  /**
   * A lower-casing word analyzer with <b>extended</b> English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
      NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);

  private final Pattern pattern;
  private final boolean toLowerCase;
  private final Set stopWords;

  /**
   * Constructs a new instance with the given parameters.
   *
   * @param pattern
   *            a regular expression delimiting tokens
   * @param toLowerCase
   *            if <code>true</code> returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
   *            or <a href="http://www.unine.ch/info/clef/">other stop word
   *            lists</a>.
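   * <p>
   * A minimal construction sketch (the stop word array here is arbitrary):
   * <pre>
   * Set stopWords = StopFilter.makeStopSet(new String[] {"the", "a", "an"});
   * PatternAnalyzer analyzer = new PatternAnalyzer(
   *     PatternAnalyzer.WHITESPACE_PATTERN, true, stopWords);
   * </pre>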
   */
  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null)
      throw new IllegalArgumentException("pattern must not be null");

    if (eqPattern(NON_WORD_PATTERN, pattern))
      pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern))
      pattern = WHITESPACE_PATTERN;

    if (stopWords != null && stopWords.size() == 0)
      stopWords = null;

    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
  }

  /**
   * Creates a token stream that tokenizes the given string into token terms
   * (aka words).
   *
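   * <p>
   * A minimal sketch of consuming the resulting stream, using the old-style
   * <code>Token next()</code> API that this class implements
   * (<code>analyzer</code> stands for any PatternAnalyzer instance):
   * <pre>
   * TokenStream stream = analyzer.tokenStream("content", "The quick brown fox");
   * for (Token t = stream.next(); t != null; t = stream.next()) {
   *   System.out.println(t.termText());
   * }
   * </pre>
   *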
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null)
      throw new IllegalArgumentException("text must not be null");

    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    } else if (pattern == WHITESPACE_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    } else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
      if (stopWords != null)
        stream = new StopFilter(stream, stopWords);
    }

    return stream;
  }

  /**
   * Creates a token stream that tokenizes all the text in the given Reader;
   * this implementation reads the entire reader into a String and forwards
   * to <code>tokenStream(String, String)</code>, so it is less efficient
   * than calling that method with the String directly.
   *
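   * <p>
   * Callers within this package can avoid the read-and-copy overhead by
   * wrapping an already available String in the package-private
   * {@link FastStringReader} (a sketch of the intended fast path):
   * <pre>
   * TokenStream stream = analyzer.tokenStream("content",
   *     new PatternAnalyzer.FastStringReader(text));
   * </pre>
   *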
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    if (reader instanceof FastStringReader) { // fast path
      return tokenStream(fieldName, ((FastStringReader) reader).getString());
    }

    try {
      String text = toString(reader);
      return tokenStream(fieldName, text);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Indicates whether some other object is "equal to" this one.
   *
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  public boolean equals(Object other) {
    if (this == other)
      return true;
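    // fast path: the two shared singletons are known to be unequal
    // (they differ only in their stop word sets), so return early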
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER)
      return false;
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER)
      return false;

    if (other instanceof PatternAnalyzer) {
      PatternAnalyzer p2 = (PatternAnalyzer) other;
      return toLowerCase == p2.toLowerCase
          && eqPattern(pattern, p2.pattern)
          && eq(stopWords, p2.stopWords);
    }
    return false;
  }

  /**
   * Returns a hash code value for the object.
   *
   * @return the hash code.
   */
  public int hashCode() {
    if (this == DEFAULT_ANALYZER)
      return -1218418418; // fast path
    if (this == EXTENDED_ANALYZER)
      return 1303507063; // fast path

    int h = 1;
    h = 31 * h + pattern.pattern().hashCode();
    h = 31 * h + pattern.flags();
    h = 31 * h + (toLowerCase ? 1231 : 1237);
    h = 31 * h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }

  /** equality where o1 and/or o2 can be null */
  private static boolean eq(Object o1, Object o2) {
    return (o1 == o2) || (o1 != null && o1.equals(o2));
  }

  /** assumes p1 and p2 are not null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    return p1 == p2
        || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
  }

  /**
   * Reads until end-of-stream and returns all read chars, finally closes the stream.
   *
   * @param input the reader to read from
   * @return the reader's contents, as a String
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      int len = 256;
      char[] buffer = new char[len];
      char[] output = new char[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          char[] tmp = new char[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
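          // the old output array's contents were just copied into tmp, so
          // it can safely be recycled as the (at least as large) read buffer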
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      return new String(output, 0, len);
    } finally {
      if (input != null)
        input.close();
    }
  }

  /** somewhat oversized to minimize hash collisions */
  private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    stops.addAll(Arrays.asList(stopWords));
    return stops;
    // return Collections.unmodifiableSet(stops);
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The workhorse; performance isn't fantastic, but it's not nearly as bad
   * as one might think - kudos to the Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {

    private final String str;
    private final boolean toLowerCase;
    private Matcher matcher;
    private int pos = 0;
    private static final Locale locale = Locale.getDefault();

    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.str = str;
      this.matcher = pattern.matcher(str);
      this.toLowerCase = toLowerCase;
    }

    public Token next() {
      if (matcher == null)
        return null;

      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        boolean isMatch = matcher.find();
        if (isMatch) {
          end = matcher.start();
          pos = matcher.end();
        } else {
          end = str.length();
          matcher = null; // we're finished
        }

        if (start != end) { // non-empty token (also covers leading/trailing text)
          String text = str.substring(start, end);
          if (toLowerCase)
            text = text.toLowerCase(locale);
          return new Token(text, start, end);
        }
        if (!isMatch)
          return null;
      }
    }

  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary.
   */
  private static final class FastStringTokenizer extends TokenStream {

    private final String str;
    private int pos;
    private final boolean isLetter;
    private final boolean toLowerCase;
    private final Set stopWords;
    private static final Locale locale = Locale.getDefault();

    public FastStringTokenizer(String str, boolean isLetter,
        boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }

    public Token next() {
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
      int i = pos;
      boolean letter = isLetter;

      int start = 0;
      String text;
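      // scan forward, skipping any stop words, until a non-stop-word token
      // is found or the end of the string is reached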
      do {
        // find beginning of token
        text = null;
        while (i < len && !isTokenChar(s.charAt(i), letter)) {
          i++;
        }

        if (i < len) { // found beginning; now find end of token
          start = i;
          while (i < len && isTokenChar(s.charAt(i), letter)) {
            i++;
          }

          text = s.substring(start, i);
          if (toLowerCase)
            text = text.toLowerCase(locale);
          // if (toLowerCase) {
          //// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
          //// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
          // text = s.substring(start, i).toLowerCase();
          //// char[] chars = new char[i-start];
          //// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
          //// text = new String(chars);
          // } else {
          // text = s.substring(start, i);
          // }
        }
      } while (text != null && isStopWord(text));

      pos = i;
      return text != null ? new Token(text, start, i) : null;
    }

    private boolean isTokenChar(char c, boolean isLetter) {
      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
    }

    private boolean isStopWord(String text) {
      return stopWords != null && stopWords.contains(text);
    }

  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {

    private final String s;

    FastStringReader(String s) {
      super(s);
      this.s = s;
    }

    String getString() {
      return s;
    }
  }

}
|