package bdd.search.spider;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.Hashtable;
import bdd.search.EnginePrefs;
import bdd.search.Monitor;
import bdd.util.FIFOQueue;

/** Written by Tim Macinta 1997 <br>
 * Distributed under the GNU Public License
 * (a copy of which is enclosed with the source). <br>
 * <br>
 * Calling the Crawler's start() method will cause the Crawler to
 * index all of the sites in its queue and then replace the main
 * index with the updated index when it completes. The Crawler's
 * queue should be filled with the starting URLs before calling
 * start().
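 * <br><br>
 * A minimal usage sketch (the seed URL below is purely illustrative, and the
 * MalformedURLException thrown by the URL constructor must be handled by the
 * caller):
 * <pre>
 *   EnginePrefs prefs = new EnginePrefs();
 *   Crawler crawler = new Crawler(prefs.getWorkingDir(), prefs);
 *   crawler.addURL(new URL("http://www.example.com/")); // example seed URL
 *   crawler.start(); // indexes the queue, then swaps in the updated index
 * </pre>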
 */
public class Crawler extends Thread {

    File working_dir;                        // directory for temp files
    Indexer indexer;                         // handles post-crawl indexing
    FIFOQueue q = new FIFOQueue();           // url queue
    Hashtable urls_done = new Hashtable(40); // keeps track of what
                                             // urls are already processed
    EnginePrefs eng_prefs;                   // preferences
    boolean exit_when_done = false;          // exit when done indexing

    /** "working_dir" should be a directory that only this
     * Crawler and the Indexer it creates will access.
     * This means that if several Crawlers are running
     * simultaneously, they should each be given a different "working_dir"
     * directory. Also, no other threads should write to this
     * directory (except for the selected Indexer).
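     * <br><br>
     * For example, two Crawlers running side by side each get their own
     * directory (the paths and the "prefs" variable are illustrative only):
     * <pre>
     *   Crawler a = new Crawler(new File("/tmp/crawl_a"), prefs);
     *   Crawler b = new Crawler(new File("/tmp/crawl_b"), prefs); // separate dir
     * </pre>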
     */
    public Crawler(File working_dir, EnginePrefs eng_prefs) {
        this.eng_prefs = eng_prefs;
        this.working_dir = working_dir;
        indexer = new Indexer(working_dir, this, eng_prefs);
        indexer.start();
    }

    /** Takes "url_to_queue" and adds it to this Crawler's queue of URLs.
     * This method should be used to add all of the desired starting URLs to
     * the queue before the Crawler is started. If the URL has already
     * been processed, or if it is a disallowed URL, it is not added.
     */
    public void addURL(URL url_to_queue) {
        if (!eng_prefs.URLAllowed(url_to_queue))
            return;                            // check if URL is allowed
        if (eng_prefs.URLNotIndexable(url_to_queue))
            return;                            // don't index non-text
        url_to_queue = simplify(url_to_queue); // remove loops/anchors
        if (urls_done.put(url_to_queue, url_to_queue) == null) {
            q.addElement(url_to_queue);        // add if not done already
            Monitor m = eng_prefs.getMonitor();
            if (m != null)
                m.indexing(url_to_queue);
        }
    }

    /** Takes "url" and removes all references to "/./" and "/../". This
     * can be used to help eliminate looping. Also removes all anchors
     * (i.e., everything after and including a '#') and fills in the default
     * http port. For example, "http://host/a/./b/../c.html#top" becomes
     * "http://host:80/a/c.html". */
    URL simplify(URL url) {
        String file = url.getFile();
        boolean changed = false; // keep track of whether we change anything

        // collapse all occurrences of "/./"

        int i = file.indexOf("/./");
        while (i >= 0) {
            changed = true;
            file = file.substring(0, i) + file.substring(i + 2);
            i = file.indexOf("/./");
        }

        // collapse all occurrences of "/../" (by removing the preceding directory)

        i = file.indexOf("/../");
        while (i >= 0) {
            changed = true;
            int i2 = file.lastIndexOf('/', i - 1);
            if (i2 < 0)
                i2 = i;
            file = file.substring(0, i2) + file.substring(i + 3);
            i = file.indexOf("/../");
        }

        // remove anchor if necessary

        if (url.getRef() != null)
            changed = true;
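        // (no explicit stripping is needed here: the URL rebuilt below is
        // constructed from "file", and url.getFile() never includes the '#'
        // reference, so marking the URL as changed is enough to drop the anchor)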

        // set port if it's not set already

        int port = url.getPort();
        String proto = url.getProtocol().toLowerCase();
        if (port < 0 && proto.equals("http")) {
            changed = true;
            port = 80;
        }

        // create a new URL if anything changed

        if (changed) {
            try {
                url = new URL(proto, url.getHost(), port, file);
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
        }
        return url;
    }

    /** This is where the actual crawling occurs. URLs are pulled off the
     * queue one at a time and loaded; successfully loaded pages are handed
     * to the Indexer, redirected URLs are re-queued, and failed URLs are
     * reported to the Monitor (if one is registered). */
    public void run() {
        if (!q.hasMoreElements())
            return;       // return if there's nothing to do
        int tmp_file = 0; // used to generate unique temporary filenames

        URLStatus url_status;
        while (true) {
            url_status = new URLStatus((URL) q.nextElement(),
                    new File(working_dir, tmp_file + ".tmp"), eng_prefs);
            tmp_file++;
            url_status.readContent();
            if (url_status.loaded()) {
                indexer.queueURL(url_status);
            } else if (url_status.moved()) {
                addURL(url_status.actual_url);
            } else {
                Monitor m = eng_prefs.getMonitor();
                if (m != null)
                    m.reportError(url_status.actual_url);
            }
            if (q.hasMoreElements()) {
                eng_prefs.pauseBetweenURLs();
            } else {
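                // the crawl queue is empty: wait while the indexer still has
                // pages to process, since parsing those pages may add new
                // links back onto this crawler's queue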
                while (!q.hasMoreElements()
                        && indexer.q.hasMoreElements()) {
                    eng_prefs.pauseBetweenURLs();
                }
                if (!q.hasMoreElements()) {
                    break;
                }
            }
        }
        Monitor m = eng_prefs.getMonitor();
        if (m != null)
            m.crawlerDone(this);
        indexer.stopWhenDone(exit_when_done);
    }

    /** This is the method that is called when this class is invoked from
     * the command line. Calling this method causes a Crawler to be created
     * and started, with the starting URLs read from the file specified by
     * the first argument (arg[0]). The file should contain only the URLs,
     * each on a line by itself. Blank lines are allowed, and lines beginning
     * with "#" are treated as comments and ignored.
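     * <br><br>
     * A sample URL file (the file name and URLs are illustrative only):
     * <pre>
     *   # seed URLs for this crawl
     *   http://www.example.com/
     *   http://www.example.com/docs/
     * </pre>
     * which would be used as: <br>
     * <code>java bdd.search.spider.Crawler seeds.txt</code>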
     */
    public static void main(String arg[]) {
        if (arg.length < 1)
            return;
        main(new File(arg[0]), new EnginePrefs(), true);
    }

    public static void main(File file, EnginePrefs prefs) {
        main(file, prefs, false);
    }

    public static void main(File file, EnginePrefs prefs, boolean exit) {
        Crawler cr = new Crawler(prefs.getWorkingDir(), prefs);
        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            String line = in.readLine();
            while (line != null) {
                line = line.trim();
                if (!line.equals("") && !line.startsWith("#")) {
                    try {
                        cr.addURL(new URL(line));
                    } catch (MalformedURLException e2) {
                        e2.printStackTrace();
                    }
                }
                line = in.readLine();
            }
            in.close();
            cr.exit_when_done = exit;
            cr.start();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}