package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Loader for text files that represent a list of stopwords.
 *
 * @version $Id: WordlistLoader.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
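   * <p>A minimal usage sketch (the file name "stopwords.txt" is illustrative,
   * not part of this API):
   * <pre>
   *   HashSet stopWords = WordlistLoader.getWordSet(new File("stopwords.txt"));
   * </pre>
   *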
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet getWordSet(File wordfile) throws IOException {
    HashSet result = new HashSet();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader);
    } finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   * Note that the given Reader is closed by this method.
   *
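   * <p>A sketch using an in-memory Reader (the word list shown is illustrative):
   * <pre>
   *   Reader reader = new StringReader("the\nand\nof\n");
   *   HashSet stopWords = WordlistLoader.getWordSet(reader);
   * </pre>
   *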
   * @param reader Reader containing the wordlist
   * @return A HashSet with the reader's words
   */
  public static HashSet getWordSet(Reader reader) throws IOException {
    HashSet result = new HashSet();
    BufferedReader br = null;
    try {
      // Avoid double-buffering if the caller already passed a BufferedReader.
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        result.add(word.trim());
      }
    } finally {
      if (br != null)
        br.close();
    }
    return result;
  }

  /**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab-separated words)
   *
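   * <p>A usage sketch (the file name and the entry shown are illustrative):
   * <pre>
   *   // stems.txt contains lines such as "mice\tmouse"
   *   HashMap stemDict = WordlistLoader.getStemDict(new File("stems.txt"));
   *   String stem = (String) stemDict.get("mice"); // "mouse"
   * </pre>
   *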
   * @param wordstemfile File containing the stem dictionary
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException if the file cannot be read
   */
  public static HashMap getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null)
      throw new NullPointerException("wordstemfile may not be null");
    HashMap result = new HashMap();
    BufferedReader br = null;
    FileReader fr = null;
    try {
      fr = new FileReader(wordstemfile);
      br = new BufferedReader(fr);
      String line;
      while ((line = br.readLine()) != null) {
        // Split on the first tab only, so a stem may itself contain tabs.
        String[] wordstem = line.split("\t", 2);
        // Skip malformed lines without a tab; indexing wordstem[1] would
        // otherwise throw an ArrayIndexOutOfBoundsException.
        if (wordstem.length < 2)
          continue;
        result.put(wordstem[0], wordstem[1]);
      }
    } finally {
      // Closing the BufferedReader also closes the wrapped FileReader;
      // close the FileReader directly only if wrapping it failed.
      if (br != null)
        br.close();
      else if (fr != null)
        fr.close();
    }
    return result;
  }

}