01: package bdd.search.spider;
02:
03: import java.io.InputStream;
04: import java.io.FileInputStream;
05: import java.io.File;
06: import java.io.IOException;
07:
08: /** Written by Tim Macinta 1997 <br>
09: * Distributed under the GNU Public License
10: * (a copy of which is enclosed with the source). <br>
11: * <br>
12: * This class is capable of extracting words from a cached HTML file.
13: */
14: public class HTMLWordExtractor extends WordExtractor {
15:
16: public HTMLWordExtractor(File cache_file) throws IOException {
17: InputStream in = new FileInputStream(cache_file);
18: int state = 0;
19: StringBuffer sb = new StringBuffer();
20: int i = in.read();
21: while (i >= 0) {
22: switch (state) {
23: case '<':
24: if (i == '>')
25: state = 0;
26: break;
27: case 0:
28: if (Character.isLetterOrDigit((char) i)) {
29: sb.append((char) i);
30: } else {
31: if (sb.length() > 0) {
32: addWord(sb.toString());
33: sb.setLength(0);
34: }
35: if (i == '<') {
36: state = '<';
37: }
38: }
39: }
40: i = in.read();
41: }
42: if (sb.length() > 0)
43: addWord(sb.toString());
44: in.close();
45: }
46:
47: }
|