001: package bdd.search.spider;
002:
import bdd.search.EnginePrefs;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Vector;
017:
018: /** Written by Tim Macinta 1997 <br>
019: * Distributed under the GNU Public License
020: * (a copy of which is enclosed with the source). <br>
021: * <br>
022: * This class holds information about the content at a particular URL.
023: * It can also be used to fetch and parse an URL.
024: */
025: public class URLStatus {
026:
027: URL given_url; // The given URL
028: URL actual_url = null; // The actual URL (differs in case of redirection)
029: File temp_file; // temporary file containing the contents of "url"
030: EnginePrefs eng_prefs; // engine preferences
031: String mime_type = ""; // mime type
032: String user_agent; // internal name for bot
033: String email_address; // email address of the administrator
034:
035: static final int LOADED = 0;
036: static final int NOT_LOADED = 1;
037: static final int MOVED = 2;
038: static final int DUPLICATE = 4;
039: static final int MISSING = 8;
040: static final int TIMED_OUT = 16;
041: static final int IO_ERROR = 32;
042: static final int UNSUPPORTED_MIMETYPE = 64;
043: static final int MISC_ERROR = 128;
044:
045: int status = NOT_LOADED; // gives the status of this URL
046:
047: /** "url" is the location of the information and "temp_file" is the
048: * temporary file that can be used to store the contents of this
049: * url.
050: */
051: public URLStatus(URL url, File temp_file, EnginePrefs eng_prefs) {
052: this .given_url = url;
053: this .actual_url = url;
054: this .temp_file = temp_file;
055: this .eng_prefs = eng_prefs;
056: user_agent = eng_prefs.getUserAgent();
057: email_address = eng_prefs.getEmailAddress();
058: }
059:
060: /** Returns true if and only if this URL was loaded without an error. */
061: public boolean loaded() {
062: return ((Integer.MAX_VALUE ^ LOADED) & status) == 0;
063: }
064:
065: /** Creates a database containing just this URL. */
066: public void dumpToDatabase(DataOutputStream out) throws IOException {
067:
068: // dump headers
069:
070: String url = actual_url.toExternalForm() + '\n';
071: out.writeLong(8 + 4 + url.length()); // pointer to start of words
072: out.writeInt(1); // number of URLs
073:
074: // dump description of URL
075:
076: out.writeBytes(url);
077:
078: // dump actual words
079:
080: dumpWords(out);
081: }
082:
083: /** Dumps the words contained in this URL in database format to "out". */
084: void dumpWords(DataOutputStream out) throws IOException {
085: WordExtractor we = getWordExtractor();
086: int len = we.countWords();
087: if (len < 1)
088: return;
089:
090: // alphabetize the words
091:
092: Vector v = new Vector(len, 5);
093: Enumeration en = we.getWords();
094: v.addElement(en.nextElement());
095: String word;
096: String word2;
097: boolean inserted;
098: while (en.hasMoreElements()) {
099: inserted = false;
100: word = (String) en.nextElement();
101: for (int i = v.size() - 1; i >= 0; i--) {
102: word2 = (String) v.elementAt(i);
103: if (word.compareTo(word2) > 0) {
104: v.insertElementAt(word, i + 1);
105: inserted = true;
106: break;
107: }
108: }
109: if (!inserted)
110: v.insertElementAt(word, 0);
111: }
112:
113: // dump the words
114:
115: en = v.elements();
116: int score;
117: int total_words = we.countWords();
118: while (en.hasMoreElements()) {
119: word = (String) en.nextElement();
120: out.writeBytes(word); // write words
121: out.write('\n');
122: out.writeInt(1); // write url number
123: score = (128 * (we.countOccurances(word) + total_words
124: - we.firstOccurance(word) + 1))
125: / total_words;
126: if (score > 255)
127: score = 255;
128: out.write(score); // write score
129: out.writeInt(0); // end of urls
130: }
131: }
132:
133: /** Returns a WordExtractor that can handle this URL's mime type.
134: * To add support for new mime types add a WordExtractor that handles
135: * those mime types here and add appropriate LinkExtractors to the
136: * getLinkExtractor() method. Also, add the mime type to the list in
137: * the mimeTypeUnderstood() method.
138: */
139: public WordExtractor getWordExtractor() throws IOException {
140: if (mime_type.equals("text/html")) {
141: return new HTMLWordExtractor(temp_file);
142: } else if (mime_type.equals("text/plain")) {
143: return new TextWordExtractor(temp_file);
144: }
145: return new WordExtractor();
146: }
147:
148: /** Returns a LinkExtractor that can handle this URL's mime type.
149: * To add support for new mime types add a LinkExtractor that handles
150: * those mime types here and add appropriate WordExtractors to the
151: * getWordExtractor() method. Also, add the mime type to the list in
152: * the mimeTypeUnderstood() method.
153: */
154: public LinkExtractor getLinkExtractor() throws IOException {
155: if (mime_type.equals("text/html")) {
156: return new HTMLLinkExtractor(temp_file, actual_url);
157: } else if (mime_type.equals("text/plain")) {
158: return new NullLinkExtractor();
159: }
160: return new NullLinkExtractor();
161: }
162:
163: /** Returns true if and only if this mime type can be processed. */
164: public boolean mimeTypeUnderstood(String mime_type) {
165: mime_type = mime_type.toLowerCase();
166:
167: // Add new mime types inside the conditional part of the 'if' statement.
168:
169: if (mime_type.equals("text/html")
170: || mime_type.equals("text/plain")) {
171: return true;
172: }
173: return false;
174: }
175:
176: /** Returns the file that is used to cache the contents of this URL. */
177: public File getCacheFile() {
178: return temp_file;
179: }
180:
181: /** Downloads the content of the given URL and stores it in a temporary
182: * cache file. */
183: public void readContent() {
184: String proto = given_url.getProtocol().toLowerCase();
185: try {
186: if (proto.equals("http")) {
187: readHTTP();
188: } else {
189: readGeneric();
190: }
191: } catch (IOException e) {
192: status |= IO_ERROR;
193: }
194: }
195:
196: /** Downloads a file using the HTTP protocol. It was necessary to
197: * write a method to do this from scratch rather than using the default
198: * method in Java because:
199: * <p><ul>
200: * <li> There is no means for specifying the user agent
201: * using the default method.
202: * <li> There is a bug in Java 1.0 implementation that makes
203: * it incompatible with HTTP version 1.1.
204: * <li> Redirects are automatically followed (at least in
205: * Java 1.0) without providing a way to determine
206: * whether a redirect has occured.
207: * </ul>
208: */
209: void readHTTP() throws IOException {
210: int port = given_url.getPort();
211: if (port < 0)
212: port = 80;
213: Socket sock = new Socket(given_url.getHost(), port);
214: PushbackInputStream pbin = new PushbackInputStream(sock
215: .getInputStream());
216: DataInputStream in = new DataInputStream(pbin);
217: DataOutputStream out = new DataOutputStream(
218: new BufferedOutputStream(sock.getOutputStream()));
219: OutputStream cache = new BufferedOutputStream(
220: new FileOutputStream(temp_file));
221: out.writeBytes("GET " + given_url.getFile() + " HTTP/1.0\n");
222: out.writeBytes("User-Agent: " + user_agent + "\n");
223: out.writeBytes("From: " + email_address + "\n");
224: out.writeBytes("Host: " + given_url.getHost() + "\n");
225: out.writeBytes("\n");
226: out.flush();
227: String line = readLine(pbin);
228: try {
229:
230: // try reading full response
231:
232: if (!line.toLowerCase().startsWith("http/")) {
233: throw new Exception();
234: }
235: int url_status = line.charAt(line.indexOf(' ') + 1);
236: switch (url_status) {
237: case '2': // request successful
238: String line2;
239: try {
240: while (true) {
241: line2 = in.readLine().toLowerCase();
242: if (line2 == null)
243: break;
244: if (line2.startsWith("content-type:")) {
245: if (line2.charAt(13) == ' ') {
246: mime_type = line2.substring(14);
247: } else {
248: mime_type = line2.substring(13);
249: }
250: if (!mimeTypeUnderstood(mime_type)) {
251: status |= UNSUPPORTED_MIMETYPE;
252: in.close();
253: out.close();
254: sock.close();
255: cache.close();
256: return;
257: }
258: } else if (line2.equals("")) {
259: break;
260: }
261: }
262: pipe(in, cache);
263: status = LOADED;
264: } catch (StringIndexOutOfBoundsException e) {
265: }
266: break;
267: case '3': // page has moved
268: status = MOVED;
269: line = in.readLine();
270: while (!line.toLowerCase().startsWith("location:"))
271: line = in.readLine();
272: if (line.charAt(9) == ' ') {
273: actual_url = new URL(line.substring(10).trim());
274: } else {
275: actual_url = new URL(line.substring(9).trim());
276: }
277: break;
278: case '4':
279: status |= MISSING;
280: break;
281: default:
282: status |= MISC_ERROR;
283: break;
284: }
285: } catch (Exception e) {
286:
287: // full response failed, now try simple response
288:
289: if (line != null)
290: new DataOutputStream(cache).writeBytes(line);
291: pipe(in, cache);
292: }
293: in.close();
294: out.close();
295: sock.close();
296: cache.close();
297: }
298:
299: /** A replacement for the java.io.DataInputStream which doesn't return
300: * the line ending characters like it should.
301: */
302: String readLine(PushbackInputStream in) throws IOException {
303: StringBuffer sb = new StringBuffer();
304: int c;
305: while (true) {
306: c = in.read();
307: switch (c) {
308: case '\n':
309: sb.append('\n');
310: return new String(sb);
311: case '\r':
312: sb.append('\r');
313: c = in.read();
314: if (c == '\n') {
315: sb.append('\n');
316: return new String(sb);
317: } else {
318: in.unread((char) c);
319: return new String(sb);
320: }
321: case -1:
322: if (sb.length() < 1)
323: return null;
324: return new String(sb);
325: default:
326: sb.append((char) c);
327: }
328: }
329: }
330:
331: /** This method provides a fallback to the default Java implementation
332: * for protocols which have not been re-implemented.
333: */
334: void readGeneric() throws IOException {
335:
336: // guess at mime type
337:
338: String url = given_url.toString().toLowerCase();
339: if (url.endsWith(".html") || url.endsWith(".htm")) {
340: mime_type = "text/html";
341: } else if (url.endsWith(".txt")) {
342: mime_type = "text/plain";
343: }
344:
345: // fetch URL
346:
347: InputStream in = given_url.openStream();
348: OutputStream out = new FileOutputStream(temp_file);
349: pipe(in, out);
350: out.close();
351: status = LOADED;
352: }
353:
354: /** Gets rid of the temporary file. */
355: public void finalize() throws Throwable {
356: temp_file.delete();
357: super .finalize();
358: }
359:
360: /** Pipes "in" to "out" until "in" is exhausted then closes "in". */
361: void pipe(InputStream in, OutputStream out) throws IOException {
362: byte[] b = new byte[512];
363: int x = in.read(b, 0, b.length);
364: while (x > 0) {
365: out.write(b, 0, x);
366: x = in.read(b, 0, b.length);
367: }
368: in.close();
369: }
370:
371: /** Returns true if and only if this URL causes a redirection. */
372: public boolean moved() {
373: return (status & MOVED) != 0;
374: }
375:
376: /** Returns the length of the content, or 0 if it's unknown. */
377: public long getContentLength() {
378: return temp_file.length();
379: }
380:
381: }
|