01: package bdd.search.spider;
02:
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
05:
/** Written by Tim Macinta 1997 <br>
 * Distributed under the GNU Public License
 * (a copy of which is enclosed with the source). <br>
 * <br>
 * A WordExtractor should be able to extract the words from
 * a given file. This class should be subclassed by classes which
 * understand different document types. <br>
 * <br>
 * Words are treated case-insensitively: every word is folded to lower
 * case (using a fixed locale, so the result does not depend on the JVM's
 * default locale) both when it is added and when it is looked up.
 */

public class WordExtractor {

    /**
     * Maps each lower-cased word (String) to an int[2] holding
     * { number of occurrences, 1-based index of the first occurrence }.
     */
    Hashtable words = new Hashtable(200);

    /** Total number of words added so far; also the index of the latest word. */
    int current_index = 0;

    public WordExtractor() {
    }

    /**
     * Returns an Enumeration over the distinct words of the document, in no
     * particular order. Each word is returned at most once, in its
     * lower-cased form, regardless of how often it appears in the document.
     *
     * @return an Enumeration whose nextElement() yields a String per word
     */
    public Enumeration getWords() {
        return words.keys();
    }

    /**
     * Returns the total number of words added to this document. Repeated
     * words are counted every time they appear.
     *
     * @return the number of calls made to addWord so far
     */
    public int countWords() {
        return current_index;
    }

    /**
     * Returns the number of times that "word" appears in the document.
     * The comparison ignores case.
     *
     * @param word the word to look up; may be null
     * @return the occurrence count, or 0 if the word is null or absent
     */
    public int countOccurances(String word) {
        int[] stats = lookup(word);
        if (stats == null) {
            return 0;
        }
        return stats[0];
    }

    /**
     * Returns the index of "word". The index is determined by counting
     * the words in the document until the first occurrence of "word" is
     * found. For instance, firstOccurance("the") would return 5 if the
     * document started like this "Once upon a time the giant tomato of...".
     * The comparison ignores case.
     *
     * @param word the word to look up; may be null
     * @return the 1-based index of the first occurrence, or -1 if the word
     *         is null or not in the document
     */
    public int firstOccurance(String word) {
        int[] stats = lookup(word);
        if (stats == null) {
            return -1;
        }
        return stats[1];
    }

    /**
     * Used internally to add a word to the list of words as they are found
     * in the document. Words must be added in document order, because the
     * running word count is what gets recorded as a word's first index.
     *
     * @param word the next word of the document
     * @throws NullPointerException if word is null
     */
    public void addWord(String word) {
        String key = normalize(word);
        int[] stats = (int[]) words.get(key);
        current_index++;
        if (stats == null) {
            stats = new int[2];
            stats[0] = 0;             // number of times the word appears
            stats[1] = current_index; // first occurrence of the word
            words.put(key, stats);
        }
        stats[0]++;
    }

    /**
     * Fetches the statistics recorded for a word, applying the same case
     * folding that addWord uses so that lookups and insertions agree.
     * Returns null when the word is null or has never been added.
     */
    private int[] lookup(String word) {
        if (word == null) {
            return null; // Hashtable.get(null) would throw
        }
        return (int[]) words.get(normalize(word));
    }

    /**
     * Folds a word to the canonical key form. A fixed locale is used so
     * that locale-specific casing rules of the JVM default locale (for
     * example the Turkish dotless i) cannot change the keys.
     */
    private static String normalize(String word) {
        return word.toLowerCase(Locale.ENGLISH);
    }

}
|