001: package bdd.search.spider;
002:
003: import java.util.Vector;
004: import java.util.StringTokenizer;
005: import java.net.URL;
006: import java.net.MalformedURLException;
007: import java.io.InputStream;
008: import java.io.File;
009: import java.io.FileInputStream;
010: import java.io.IOException;
011:
012: /** Written by Tim Macinta 1997 <br>
013: * Distributed under the GNU Public License
014: * (a copy of which is enclosed with the source). <br>
015: * <br>
016: * This LinkExtractor can extract URLs from HTML files.
017: */
018:
019: public class HTMLLinkExtractor implements LinkExtractor {
020:
021: Vector urls = new Vector(6, 9); // list of URLs
022: int next_url = 0; // next URL to return
023: int url_count = 0; // number of URLs
024: URL base = null; // base URL
025:
026: /** Creates a new HTMLLinkExtractor that will enumerate all the
027: * URLs in the give "cache_file".
028: */
029: public HTMLLinkExtractor(File cache_file, URL base_url)
030: throws IOException {
031: this .base = base_url;
032: InputStream in = new FileInputStream(cache_file);
033: int state = 0;
034: StringBuffer sb = new StringBuffer();
035: int i = in.read();
036: while (i >= 0) {
037: switch (state) {
038: case 0:
039: if (i == '<')
040: state = '<';
041: break;
042: case '<':
043: if (i == '>') {
044: state = 0;
045: analyze(sb.toString());
046: sb.setLength(0);
047: } else {
048: sb.append((char) i);
049: }
050: }
051: i = in.read();
052: }
053: if (sb.length() > 0)
054: analyze(sb.toString());
055: in.close();
056: }
057:
058: /** Analyzes "param", which should be the contents between a '<' and a '>',
059: * and adds any URLs that are found to the list of URLs.
060: */
061: public void analyze(String param) {
062: StringTokenizer st = new StringTokenizer(param);
063: if (st.countTokens() < 2)
064: return;
065: String first_word = st.nextToken().toLowerCase();
066: if (first_word.equals("a")) {
067: analyzeAnchor(st.nextToken(""));
068: } else if (first_word.equals("frame")) {
069: analyzeFrame(st.nextToken(""));
070: } else if (first_word.equals("base")) {
071: extractBase(st.nextToken(""));
072: }
073: }
074:
075: /** Analyzes the <a> tag. */
076: void analyzeAnchor(String anchor) {
077: String href = extract(anchor, "href");
078: if (href == null)
079: return;
080: try {
081: addURL(new URL(base, href));
082: } catch (MalformedURLException e) {
083: anchor = anchor.toLowerCase();
084: // java doesn't understand mailto and will throw an exception
085: if (!href.startsWith("mailto:")) {
086: e.printStackTrace();
087: }
088: }
089: }
090:
091: /** Analyzes the <frame> tag. */
092: void analyzeFrame(String frame) {
093: String src = extract(frame, "src");
094: if (src == null)
095: return;
096: try {
097: addURL(new URL(base, src));
098: } catch (MalformedURLException e) {
099: e.printStackTrace();
100: }
101: }
102:
103: /** Extracts the base URL from the <base> tag. */
104: void extractBase(String b) {
105: String b2 = extract(b, "href");
106: if (b2 != null) {
107: try {
108: base = new URL(base, b2);
109: } catch (MalformedURLException e) {
110: e.printStackTrace();
111: }
112: }
113: }
114:
115: /** Adds "url" to the list of URLs. */
116: public void addURL(URL url) {
117: urls.addElement(url);
118: url_count++;
119: }
120:
121: public boolean hasMoreElements() {
122: return url_count != next_url;
123: }
124:
125: public Object nextElement() {
126: Object ob = urls.elementAt(next_url);
127: next_url++;
128: return ob;
129: }
130:
131: /** Resets this enumeration. */
132: public void reset() {
133: next_url = 0;
134: }
135:
136: /** Returns the value in "line" associated with "key", or null if "key"
137: * is not found. For instance, if line were "a href="blah blah blah"
138: * and "key" were "href" this method would return "blah blah blah".
139: * <p>
140: * Keys are case insensitive.
141: */
142: String extract(String line, String key) {
143: try {
144: key = key.toLowerCase();
145: String lower_case = line.toLowerCase();
146: int i = lower_case.indexOf(key);
147: if (i < 0)
148: return null;
149: i += key.length();
150: if (line.charAt(i) != '=')
151: return null;
152: i++;
153: int i2;
154: if (line.charAt(i) == '"') {
155: i++;
156: i2 = line.indexOf('"', i);
157: if (i2 < 0) {
158: return line.substring(i);
159: } else {
160: return line.substring(i, i2);
161: }
162: } else {
163: int targ = line.length();
164: for (i2 = i; i < targ; i++) {
165: if (Character.isSpace(line.charAt(i)))
166: break;
167: }
168: return line.substring(i, i2);
169: }
170: } catch (StringIndexOutOfBoundsException e) {
171: }
172: return null;
173: }
174:
175: }
|