package bdd.search;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.InputStreamReader;

/** Written by Tim Macinta 1997 <br>
 *  Distributed under the GNU Public License
 *  (a copy of which is enclosed with the source). <br>
 *  <br>
 *  Encapsulates the preferences for the crawler and the search
 *  engine.
 */

public class EnginePrefs {

    /** The time to pause between URL fetches (in seconds). */
    public int pause_time = 5;

    File main_dir = new File("searchdb");                // directory containing main index
                                                         //   and custom html files
    File main_index = new File(main_dir, "main.db");     // main index
    File rules = new File(main_dir, "rules.txt");        // inclusion/exclusion rules
    File header = new File(main_dir, "header.html");     // header file
    File footer = new File(main_dir, "footer.html");     // footer file
    File notfound = new File(main_dir, "notfound.html"); // query not found file
    File url_list = new File(main_dir, "urls.txt");      // list of starting URLs
    File working_dir = new File("searchtmp");            // temporary working directory

    Vector excluded = new Vector(3, 10);                 // excluded URLs
    Vector included = new Vector(3, 10);                 // included URLs
    Hashtable hosts = new Hashtable(3, 6);               // hosts where we've read robots.txt
    String user_agent = "BDDBot";                        // name used when retrieving URLs
    String email_address = "nobody@nowhere.edu";         // administrator's email address
    boolean filter_cgi = true;                           // filter out cgi urls?

    Monitor monitor = null;                              // query and url monitor
    public static int port = 8001;                       // default web server port

    public EnginePrefs() {
        if (!main_dir.exists())
            main_dir.mkdir();
        if (!working_dir.exists())
            working_dir.mkdir();
        try {
            readRulesFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns true if "url" is allowed to be indexed and false otherwise. */
    public boolean URLAllowed(URL url) {
        URL u2;
        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port < 0 && protocol.equals("http"))
            port = 80;
        String file = url.getFile();

        // filter out cgi scripts

        if (filter_cgi) {
            if (file.indexOf('?') > -1)
                return false;
            if (file.startsWith("/cgi-bin/"))
                return false;
        }

        // check exclusion rules first - a match here always wins

        int p;
        Enumeration en = excluded.elements();
        while (en.hasMoreElements()) {
            u2 = (URL) en.nextElement();
            p = u2.getPort();
            if (protocol.equals(u2.getProtocol())
                    && host.equals(u2.getHost())
                    && (port == p || (port == 80 && p == -1))
                    && file.startsWith(u2.getFile()))
                return false;
        }

        // "file:" URLs that aren't explicitly excluded are included by default

        if (protocol.equals("file"))
            return true;

        // check inclusion rules

        en = included.elements();
        while (en.hasMoreElements()) {
            u2 = (URL) en.nextElement();
            p = u2.getPort();
            if (protocol.equals(u2.getProtocol())
                    && host.equals(u2.getHost())
                    && (port == p || (port == 80 && p == -1))
                    && file.startsWith(u2.getFile())) {
                if (protocol.equals("http")
                        && hosts.get(host + ":" + port) == null) {
                    // first contact with this host: honor its robots.txt,
                    // then re-check the URL against any new exclusions
                    readRobotsDotText(host, port);
                    hosts.put(host + ":" + port, Boolean.TRUE);
                    return URLAllowed(url);
                } else {
                    return true;
                }
            }
        }
        return false;
    }

    /** Pauses for the amount of time that has been specified for pausing
     *  between URL fetches. */
    public void pauseBetweenURLs() {
        long start = System.currentTimeMillis();
        long diff = -pause_time * 1000L;   // milliseconds still to wait, negated
        while (diff < 0) {
            try {
                Thread.sleep(-diff);
            } catch (InterruptedException e) {
            }
            // recompute in case the sleep was interrupted early
            diff = System.currentTimeMillis() - start - pause_time * 1000L;
        }
    }

    public File getMainIndex() {
        return main_index;
    }

    public File getMainDir() {
        return main_dir;
    }

    /** Returns the working directory for use by a crawler. If more than
     *  one crawler is running at the same time they should be given different
     *  working directories.
     */
    public File getWorkingDir() {
        return working_dir;
    }

    public File getHeaderFile() {
        return header;
    }

    public File getFooterFile() {
        return footer;
    }

    public File getNotFoundFile() {
        return notfound;
    }

    public File getStartingFile() {
        return url_list;
    }

    /** The rules file contains rules which determine which URLs are allowed
     *  and which URLs should be excluded. A line that is in the form:
     *  <pre>
     *  include http://gsd.mit.edu/
     *  </pre>
     *  will cause all URLs that start with "http://gsd.mit.edu/" to be
     *  included. Similarly, to exclude URLs, use the keyword "exclude"
     *  instead of "include". Blank lines and lines starting with "#" are
     *  ignored.
     *  <p>
     *  When a URL is checked against the inclusion/exclusion rules the
     *  exclusion rules are checked first, and if the URL matches an
     *  exclusion rule it is not included. If a URL is not covered by
     *  either rule it is not included, unless it is a "file://" URL in
     *  which case it is included by default.
     */
    public File getRulesFile() {
        return rules;
    }
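
    /* For illustration, a plausible rules.txt (the host and paths below are
     * hypothetical examples, not part of the distribution):
     *
     *     # crawl the main site
     *     include http://www.example.edu/
     *     # ...but skip the staging area
     *     exclude http://www.example.edu/staging/
     *
     * Under these rules http://www.example.edu/index.html is allowed, while
     * http://www.example.edu/staging/test.html is rejected by URLAllowed().
     */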

    /** Causes the inclusion/exclusion rules to be read. This method should
     *  be called if the rules file is changed.
     */
    public void readRulesFile() throws IOException {
        excluded.removeAllElements();
        included.removeAllElements();
        hosts.clear();
        BufferedReader in = new BufferedReader(new FileReader(rules));
        String line = in.readLine();
        while (line != null) {
            line = line.trim();
            try {
                if (line.startsWith("include ")) {
                    included.addElement(new URL(line.substring(8)));
                } else if (line.startsWith("exclude ")) {
                    excluded.addElement(new URL(line.substring(8)));
                }
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
            line = in.readLine();
        }
        in.close();
    }

    /** Reads the "robots.txt" file on the given host and uses the results
     *  to determine what files on "host" are crawlable.
     */
    public void readRobotsDotText(String host, int port) {
        try {
            if (port < 0)
                port = 80;
            URL url = new URL("http", host, port, "/robots.txt");
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream()));
            boolean relevant = false;
            String line = in.readLine();
            String lower_case;
            String us = user_agent.toLowerCase();
            while (line != null) {
                line = line.trim();
                lower_case = line.toLowerCase();
                if (lower_case.startsWith("user-agent:")) {

                    // determine if the following directives apply to us

                    int i = 11;
                    while (i < line.length() && Character.isWhitespace(line.charAt(i)))
                        i++;
                    int i2 = lower_case.indexOf(' ', i);
                    if (i2 < 0)
                        i2 = lower_case.length();
                    lower_case = lower_case.substring(i, i2);
                    if (lower_case.endsWith("*")) {
                        lower_case = lower_case.substring(0, lower_case.length() - 1);
                    }
                    relevant = us.startsWith(lower_case);
                } else if (lower_case.startsWith("disallow:")) {

                    // assimilate directive if applicable

                    if (relevant) {
                        int i = 9;
                        while (i < line.length() && Character.isWhitespace(line.charAt(i)))
                            i++;
                        int i2 = line.indexOf(' ', i);
                        if (i2 < 0)
                            i2 = line.length();
                        line = line.substring(i, i2);
                        // an empty "Disallow:" means nothing is disallowed
                        if (line.length() > 0) {
                            excluded.addElement(new URL("http", host, port, line));
                        }
                    }
                }
                line = in.readLine();
            }
            in.close();
        } catch (IOException e) {
            // no robots.txt (or it is unreadable) - no extra restrictions apply
        } catch (StringIndexOutOfBoundsException e2) {
            // malformed directive - ignore the rest of the file
        }
    }
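
    /* For illustration, given this hypothetical robots.txt on an included
     * host, the parser above excludes both /private/ and /tmp/, since
     * "bddbot" matches the stripped "*" record (the empty prefix) as well
     * as its own record:
     *
     *     User-agent: *
     *     Disallow: /private/
     *
     *     User-agent: BDDBot
     *     Disallow: /tmp/
     */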

    public String getUserAgent() {
        return user_agent;
    }

    public String getEmailAddress() {
        return email_address;
    }

    public Monitor getMonitor() {
        return monitor;
    }

    /** Returns true if this URL represents a file type that is not indexable. */
    public boolean URLNotIndexable(URL url) {
        String f = url.getFile().toLowerCase();
        return f.endsWith(".gif") || f.endsWith(".tif")
                || f.endsWith(".map") || f.endsWith(".jpg")
                || f.endsWith(".ppt") || f.endsWith(".doc")
                || f.endsWith(".pdf") || f.endsWith(".xls")
                || f.endsWith(".rtf");
    }
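
    /* A minimal usage sketch, not part of the original distribution: builds
     * the preferences (which creates searchdb/ and searchtmp/ and loads any
     * rules.txt) and probes a hypothetical URL. Note that URLAllowed() may
     * fetch robots.txt over HTTP the first time it sees an included host.
     */
    public static void main(String[] args) throws MalformedURLException {
        EnginePrefs prefs = new EnginePrefs();
        URL u = new URL("http://www.example.edu/index.html");
        System.out.println(u + " allowed:   " + prefs.URLAllowed(u));
        System.out.println(u + " indexable: " + !prefs.URLNotIndexable(u));
    }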
}