001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.solr.analysis;
017:
018: import org.apache.solr.core.Config;
019: import org.apache.lucene.analysis.StopFilter;
020: import org.apache.lucene.analysis.TokenStream;
021: import org.apache.lucene.analysis.TokenFilter;
022: import org.apache.lucene.analysis.Token;
023:
024: import java.util.Map;
025: import java.util.List;
026: import java.util.Set;
027: import java.io.IOException;
028:
029: /**
030: * @author yonik
031: * @version $Id: EnglishPorterFilterFactory.java 472574 2006-11-08 18:25:52Z yonik $
032: */
033: public class EnglishPorterFilterFactory extends BaseTokenFilterFactory {
034: public void init(Map<String, String> args) {
035: super .init(args);
036: String wordFile = args.get("protected");
037: if (wordFile != null) {
038: try {
039: List<String> wlist = Config.getLines(wordFile);
040: protectedWords = StopFilter
041: .makeStopSet((String[]) wlist
042: .toArray(new String[0]));
043: } catch (IOException e) {
044: throw new RuntimeException(e);
045: }
046: }
047: }
048:
049: private Set protectedWords = null;
050:
051: public TokenStream create(TokenStream input) {
052: return new EnglishPorterFilter(input, protectedWords);
053: }
054: }
055:
056: /** English Porter2 filter that doesn't use reflection to
057: /* adapt lucene to the snowball stemmer code.
058: */
059: class EnglishPorterFilter extends TokenFilter {
060: private final Set protWords;
061: private net.sf.snowball.ext.EnglishStemmer stemmer;
062:
063: public EnglishPorterFilter(TokenStream source, Set protWords) {
064: super (source);
065: this .protWords = protWords;
066: stemmer = new net.sf.snowball.ext.EnglishStemmer();
067: }
068:
069: /** the original code from lucene sandbox
070: public final Token next() throws IOException {
071: Token token = input.next();
072: if (token == null)
073: return null;
074: stemmer.setCurrent(token.termText());
075: try {
076: stemMethod.invoke(stemmer, EMPTY_ARGS);
077: } catch (Exception e) {
078: throw new RuntimeException(e.toString());
079: }
080: return new Token(stemmer.getCurrent(),
081: token.startOffset(), token.endOffset(), token.type());
082: }
083: **/
084:
085: public Token next() throws IOException {
086: Token tok = input.next();
087: if (tok == null)
088: return null;
089: String tokstr = tok.termText();
090:
091: // if protected, don't stem. use this to avoid stemming collisions.
092: if (protWords != null && protWords.contains(tokstr)) {
093: return tok;
094: }
095:
096: stemmer.setCurrent(tokstr);
097: stemmer.stem();
098: String newstr = stemmer.getCurrent();
099: if (tokstr.equals(newstr)) {
100: return tok;
101: } else {
102: // TODO: it would be nice if I could just set termText directly like
103: // lucene packages can.
104: Token newtok = new Token(newstr, tok.startOffset(), tok
105: .endOffset(), tok.type());
106: newtok.setPositionIncrement(tok.getPositionIncrement());
107: return newtok;
108: }
109:
110: }
111: }
|