01: package bdd.search.spider;
02:
03: import java.io.InputStream;
04: import java.io.FileInputStream;
05: import java.io.File;
06: import java.io.IOException;
07:
08: /** Written by Tim Macinta 1997 <br>
09: * Distributed under the GNU Public License
10: * (a copy of which is enclosed with the source). <br>
11: * <br>
12: * This class is capable of extracting words from a cached text file.
13: */
14: public class TextWordExtractor extends WordExtractor {
15:
16: public TextWordExtractor(File cache_file) throws IOException {
17: InputStream in = new FileInputStream(cache_file);
18: StringBuffer sb = new StringBuffer();
19: int i = in.read();
20: while (i >= 0) {
21: if (Character.isLetterOrDigit((char) i)) {
22: sb.append((char) i);
23: } else {
24: if (sb.length() > 0) {
25: addWord(sb.toString());
26: sb.setLength(0);
27: }
28: }
29: i = in.read();
30: }
31: if (sb.length() > 0)
32: addWord(sb.toString());
33: in.close();
34: }
35:
36: }
|