001: /*
002: * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
003: *
004: * Copyright (c) 2001 Brian Pitcher
005: *
006: * Permission is hereby granted, free of charge, to any person obtaining a
007: * copy of this software and associated documentation files (the "Software"),
008: * to deal in the Software without restriction, including without limitation
009: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
010: * and/or sell copies of the Software, and to permit persons to whom the
011: * Software is furnished to do so, subject to the following conditions:
012: *
013: * The above copyright notice and this permission notice shall be included in
014: * all copies or substantial portions of the Software.
015: *
016: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
017: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
018: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
019: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
020: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
021: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
022: * SOFTWARE.
023: */
024:
025: // $Header: /cvsroot/weblech/weblech/src/weblech/spider/SpiderConfig.java,v 1.9 2002/06/09 11:36:23 weblech Exp $
026: package weblech.spider;
027:
028: import weblech.util.Logger;
029:
030: import java.io.File;
031: import java.io.Serializable;
032: import java.util.*;
033: import java.net.URL;
034: import java.net.MalformedURLException;
035:
036: public class SpiderConfig extends Logger implements Serializable {
037: private File saveRootDirectory;
038: private File mailtoLogFile;
039:
040: private boolean refreshHTMLs;
041: private boolean refreshImages;
042: private boolean refreshOthers;
043:
044: private Set htmlExtensions;
045: private Set imageExtensions;
046:
047: private URL startLocation;
048: private String urlMatch;
049:
050: private List interestingURLSubstrings;
051: private List boringURLSubstrings;
052:
053: private boolean depthFirst;
054: private int maxDepth;
055:
056: private String userAgent;
057:
058: private String basicAuthUser;
059: private String basicAuthPassword;
060:
061: private int spiderThreads;
062:
063: private long checkpointInterval;
064:
065: /**
066: * Create a default config.
067: */
068: public SpiderConfig() {
069: _logClass.debug("SpiderConfig()");
070:
071: saveRootDirectory = new File(".");
072: mailtoLogFile = new File("mailto.txt");
073:
074: refreshHTMLs = true;
075: refreshImages = false;
076: refreshOthers = false;
077:
078: htmlExtensions = new HashSet();
079: htmlExtensions.add("htm");
080: htmlExtensions.add("html");
081: htmlExtensions.add("shtml");
082:
083: imageExtensions = new HashSet();
084: imageExtensions.add("jpg");
085: imageExtensions.add("gif");
086: imageExtensions.add("png");
087:
088: urlMatch = null;
089: interestingURLSubstrings = new ArrayList();
090: boringURLSubstrings = new ArrayList();
091: depthFirst = false;
092: maxDepth = 0;
093:
094: userAgent = "WebLech Spider 0.01alpha";
095: basicAuthUser = "";
096: basicAuthPassword = "";
097:
098: spiderThreads = 1;
099:
100: checkpointInterval = 0;
101: }
102:
103: /**
104: * Create a config from a java.util.Properties object.
105: */
106: public SpiderConfig(Properties props) {
107: _logClass.debug("SpiderConfig(props)");
108:
109: saveRootDirectory = new File(props.getProperty(
110: "saveRootDirectory", "."));
111: if (!saveRootDirectory.exists()) {
112: if (!saveRootDirectory.mkdirs()) {
113: _logClass.error("Couldn't create root directory: "
114: + saveRootDirectory);
115: _logClass.info("Defaulting to . instead");
116: saveRootDirectory = new File(".");
117: }
118: } else if (!saveRootDirectory.isDirectory()) {
119: _logClass.error("Save root is not a directory: "
120: + saveRootDirectory);
121: _logClass.info("Defaulting to . instead");
122: saveRootDirectory = new File(".");
123: }
124:
125: String mailtoFileStr = props.getProperty("mailtoLogFile",
126: "mailto.txt");
127: // Check if absolute or relative name given
128: if (mailtoFileStr.indexOf(":") != -1
129: || mailtoFileStr.startsWith("/")
130: || mailtoFileStr.startsWith("\\")) {
131: _logClass
132: .debug("Using absolute file name " + mailtoFileStr);
133: mailtoLogFile = new File(mailtoFileStr);
134: } else {
135: _logClass
136: .debug("Constructing relative file name "
137: + saveRootDirectory.getPath() + "/"
138: + mailtoFileStr);
139: mailtoLogFile = new File(saveRootDirectory.getPath() + "/"
140: + mailtoFileStr);
141: }
142:
143: refreshHTMLs = Boolean.valueOf(
144: props.getProperty("refreshHTMLs", "true"))
145: .booleanValue();
146: refreshImages = Boolean.valueOf(
147: props.getProperty("refreshImages", "false"))
148: .booleanValue();
149: refreshOthers = Boolean.valueOf(
150: props.getProperty("refreshOthers", "false"))
151: .booleanValue();
152:
153: htmlExtensions = parseSet(props.getProperty("htmlExtensions",
154: "htm,html,shtml"));
155: imageExtensions = parseSet(props.getProperty("imageExtensions",
156: "jpg,gif,png"));
157:
158: String startLocStr = props.getProperty("startLocation");
159: if (startLocStr != null) {
160: try {
161: startLocation = new URL(startLocStr);
162: } catch (MalformedURLException murle) {
163: _logClass.error(
164: "Caught MalformedURLException parsing start URL '"
165: + startLocStr + "' : "
166: + murle.getMessage(), murle);
167: }
168: } else {
169: _logClass.warn("startLocation not found in properties");
170: }
171:
172: urlMatch = props.getProperty("urlMatch");
173:
174: interestingURLSubstrings = parsePropCommaSeparated(props
175: .getProperty("interestingURLs"));
176: boringURLSubstrings = parsePropCommaSeparated(props
177: .getProperty("boringURLs"));
178:
179: depthFirst = Boolean.valueOf(
180: props.getProperty("depthFirst", "false"))
181: .booleanValue();
182: try {
183: String maxDepthStr = props.getProperty("maxDepth", "0");
184: maxDepth = Integer.parseInt(maxDepthStr);
185: } catch (NumberFormatException nfe) {
186: _logClass
187: .error(
188: "Caught number format exception parsing max depth, defaulting to 1",
189: nfe);
190: maxDepth = 1;
191: }
192:
193: userAgent = props.getProperty("userAgent",
194: "WebLech Spider 0.01alpha");
195: basicAuthUser = props.getProperty("basicAuthUser", "");
196: basicAuthPassword = props.getProperty("basicAuthPassword", "");
197:
198: try {
199: String threadsStr = props.getProperty("spiderThreads", "1");
200: spiderThreads = Integer.parseInt(threadsStr);
201: } catch (NumberFormatException nfe) {
202: _logClass
203: .error(
204: "Caught number format exception parsing number of threads, defaulting to 1",
205: nfe);
206: spiderThreads = 1;
207: }
208:
209: try {
210: String intervalStr = props.getProperty(
211: "checkpointInterval", "0");
212: checkpointInterval = Long.parseLong(intervalStr);
213: } catch (NumberFormatException nfe) {
214: _logClass
215: .error(
216: "Caught number format exception parsing checkpoint interval, defaulting to 0",
217: nfe);
218: spiderThreads = 1;
219: }
220: }
221:
222: private List parsePropCommaSeparated(String str) {
223: ArrayList result = new ArrayList();
224: if (str != null && str.length() > 0) {
225: StringTokenizer tok = new StringTokenizer(str, ",");
226: while (tok.hasMoreTokens()) {
227: result.add(tok.nextToken());
228: }
229: }
230: return result;
231: }
232:
233: public void setRefreshHTMLs(boolean refreshHTMLs) {
234: this .refreshHTMLs = refreshHTMLs;
235: }
236:
237: public boolean refreshHTMLs() {
238: return refreshHTMLs;
239: }
240:
241: public void setRefreshImages(boolean refreshImages) {
242: this .refreshImages = refreshImages;
243: }
244:
245: public boolean refreshImages() {
246: return refreshImages;
247: }
248:
249: public void setRefreshOthers(boolean refreshOthers) {
250: this .refreshOthers = refreshOthers;
251: }
252:
253: public boolean refreshOthers() {
254: return refreshOthers;
255: }
256:
257: public void setSaveRootDirectory(File saveRootDirectory) {
258: this .saveRootDirectory = saveRootDirectory;
259: }
260:
261: public File getSaveRootDirectory() {
262: return saveRootDirectory;
263: }
264:
265: public void setMailtoLogFile(File mailtoLogFile) {
266: this .mailtoLogFile = mailtoLogFile;
267: }
268:
269: public File getMailtoLogFile() {
270: return mailtoLogFile;
271: }
272:
273: public void setStartLocation(URL startLocation) {
274: this .startLocation = startLocation;
275: }
276:
277: public URL getStartLocation() {
278: return startLocation;
279: }
280:
281: public void setURLMatch(String urlMatch) {
282: this .urlMatch = urlMatch;
283: }
284:
285: public String getURLMatch() {
286: return urlMatch;
287: }
288:
289: public List getInterestingURLSubstrings() {
290: return interestingURLSubstrings;
291: }
292:
293: public void setInterestingURLSubstrings(
294: List interestingURLSubstrings) {
295: this .interestingURLSubstrings = interestingURLSubstrings;
296: }
297:
298: public List getBoringURLSubstrings() {
299: return boringURLSubstrings;
300: }
301:
302: public void setBoringURLSubstrings(List boringURLSubstrings) {
303: this .boringURLSubstrings = boringURLSubstrings;
304: }
305:
306: public boolean isInteresting(URL u) {
307: return matchURL(u, interestingURLSubstrings);
308: }
309:
310: public boolean isBoring(URL u) {
311: return matchURL(u, boringURLSubstrings);
312: }
313:
314: private boolean matchURL(URL u, List substrings) {
315: String str = u.toExternalForm();
316: for (Iterator i = substrings.iterator(); i.hasNext();) {
317: String substr = (String) i.next();
318: if (str.indexOf(substr) != -1) {
319: return true;
320: }
321: }
322: return false;
323: }
324:
325: public void setDepthFirstSearch(boolean depthFirst) {
326: this .depthFirst = depthFirst;
327: }
328:
329: public boolean isDepthFirstSearch() {
330: return depthFirst;
331: }
332:
333: public void setMaxDepth(int maxDepth) {
334: this .maxDepth = maxDepth;
335: }
336:
337: public int getMaxDepth() {
338: return maxDepth;
339: }
340:
341: public void setUserAgent(String userAgent) {
342: this .userAgent = userAgent;
343: }
344:
345: public String getUserAgent() {
346: return userAgent;
347: }
348:
349: public void setBasicAuthUser(String basicAuthUser) {
350: this .basicAuthUser = basicAuthUser;
351: }
352:
353: public String getBasicAuthUser() {
354: return basicAuthUser;
355: }
356:
357: public void setBasicAuthPassword(String basicAuthPassword) {
358: this .basicAuthPassword = basicAuthPassword;
359: }
360:
361: public String getBasicAuthPassword() {
362: return basicAuthPassword;
363: }
364:
365: public void setSpiderThreads(int spiderThreads) {
366: this .spiderThreads = spiderThreads;
367: }
368:
369: public int getSpiderThreads() {
370: return spiderThreads;
371: }
372:
373: public void setCheckpointInterval(long interval) {
374: this .checkpointInterval = interval;
375: }
376:
377: public long getCheckpointInterval() {
378: return checkpointInterval;
379: }
380:
381: public String toString() {
382: return "depthFirst:\t" + depthFirst + "\nmaxDepth:\t"
383: + maxDepth + "\nhtmlExtensions:\t"
384: + fromSet(htmlExtensions) + "\nimageExtensions:\t"
385: + fromSet(imageExtensions) + "\nrefreshHTMLs:\t"
386: + refreshHTMLs + "\nrefreshImages:\t" + refreshImages
387: + "\nrefreshOthers:\t" + refreshOthers
388: + "\nsaveRootDirectory:\t" + saveRootDirectory
389: + "\nstartLocation:\t" + startLocation
390: + "\nurlMatch:\t" + urlMatch + "\nuserAgent:\t"
391: + userAgent + "\nbasicAuthUser:\t" + basicAuthUser
392: + "\nbasicAuthPassword:\t" + "***"
393: + "\nspiderThreads:\t" + spiderThreads
394: + "\ncheckpointInterval:\t" + checkpointInterval;
395: }
396:
397: private Set parseSet(String str) {
398: _logClass.debug("parseSet(" + str + ")");
399: HashSet result = new HashSet();
400: StringTokenizer sTok = new StringTokenizer(str, ",");
401: while (sTok.hasMoreTokens()) {
402: String tok = sTok.nextToken().trim();
403: result.add(tok);
404: }
405: return result;
406: }
407:
408: private String fromSet(Set s) {
409: StringBuffer sb = new StringBuffer();
410: boolean first = true;
411: for (Iterator i = s.iterator(); i.hasNext();) {
412: String str = (String) i.next();
413: if (first) {
414: first = false;
415: } else {
416: sb.append(",");
417: }
418: sb.append(str);
419: }
420: return sb.toString();
421: }
422:
423: } // End class SpiderConfig
|