001: package net.matuschek.jobo;
002:
003: /************************************************
004: Copyright (c) 2001/2002 by Daniel Matuschek
005: *************************************************/
006:
007: import java.io.File;
008: import java.net.URL;
009:
010: import net.matuschek.getopt.GetOpt;
011: import net.matuschek.http.HttpDocToFile;
012: import net.matuschek.http.SystemOutHttpToolCallback;
013: import net.matuschek.spider.WebRobot;
014:
015: import org.apache.log4j.Category;
016: import org.apache.log4j.PropertyConfigurator;
017: import org.apache.log4j.BasicConfigurator;
018:
019: /**
020: * This is the JoBo command line interface.
021: *
022: * @author Daniel Matuschek
023: * @version $Id $
024: */
025: public class JoBo {
026:
027: protected static Category log = Category.getInstance("");
028:
029: public static void printUsage() {
030: final String usageInfo = "command line options: \n"
031: + " [-r http://...] set start referer (default \"-\")\n"
032: + " [-d maxdepth] set maximal search depth (default 0)\n"
033: + " [-o] allow walk to other hosts (default no)\n"
034: + " [-s directory] directory to store retrieved documents\n"
035: + " (default \".\")\n"
036: + " [-m minsize] store only files larger then this size in bytes\n"
037: + " (default 0)\n"
038: + " [-a agentName] set user agent name\n"
039: + " (default \"JoBo\")\n"
040: + " [-i] ignore robots.txt\n"
041: + " [-w seconds] wait n seconds after retrieving a file to limit\n"
042: + " load on the remote server (default 60)\n"
043: + " [-v] verbose mode, useful, if something is wrong\n"
044: + " with the XML configuration\n"
045: + " url start URL";
046:
047: System.out.println(usageInfo + "\n\n");
048: }
049:
050: /**
051: initialize log4j logging subsystem
052: **/
053: public static void initializeLogging() {
054: final String configfile = "logging.conf";
055:
056: File f = new File(configfile);
057: if (f.exists()) {
058: // read the logging properties from configuration file
059: PropertyConfigurator.configure(configfile);
060: } else {
061: BasicConfigurator.configure();
062: }
063: }
064:
065: public static void main(String[] argv) throws Exception {
066: String basedir = ".";
067: int minSize = 0;
068:
069: initializeLogging();
070:
071: if (argv.length < 1) {
072: printUsage();
073: return;
074: }
075:
076: // get command line options
077: GetOpt opt = new GetOpt(argv);
078: String option = null;
079:
080: JoBoBase jobobase = JoBoBase.createFromXML();
081: WebRobot robby = jobobase.getRobot();
082:
083: // referer
084: option = opt.getOptionString("r");
085: if (option != null) {
086: robby.setStartReferer(option);
087: }
088:
089: // maximal depth
090: option = opt.getOptionString("d");
091: if (option != null) {
092: try {
093: int maxDepth = Integer.parseInt(option);
094: robby.setMaxDepth(maxDepth);
095: } catch (NumberFormatException e) {
096: System.out.println("Wrong number for maxDepth: "
097: + option);
098: }
099: }
100:
101: // walk to other hosts ?
102: if (opt.getOptionBoolean("o")) {
103: robby.setWalkToOtherHosts(true);
104: }
105:
106: // store directory
107: option = opt.getOptionString("s");
108: if (option != null) {
109: basedir = option;
110: }
111:
112: // minimal file size
113: option = opt.getOptionString("m");
114: if (option != null) {
115: try {
116: minSize = Integer.parseInt(option);
117: } catch (NumberFormatException e) {
118: }
119: }
120:
121: // agent name
122: option = opt.getOptionString("a");
123: if (option != null) {
124: robby.setAgentName(option);
125: }
126:
127: // ignore robots.txt
128: if (opt.getOptionBoolean("i")) {
129: robby.setIgnoreRobotsTxt(true);
130: }
131:
132: // wait time
133: option = opt.getOptionString("w");
134: if (option != null) {
135: try {
136: int waitTime = Integer.parseInt(option);
137: robby.setSleepTime(waitTime * 1000);
138: } catch (NumberFormatException e) {
139: }
140: }
141:
142: // print usage
143: if (opt.getOptionBoolean("?")) {
144: printUsage();
145: return;
146: }
147:
148: URL u = new URL(argv[argv.length - 1]);
149:
150: HttpDocToFile docStore = new HttpDocToFile(basedir);
151: docStore.setMinFileSize(minSize);
152:
153: SystemOutHttpToolCallback statusInfo = new SystemOutHttpToolCallback();
154:
155: robby.setStartURL(u);
156: robby.setDocManager(docStore);
157: robby.setHttpToolCallback(statusInfo);
158:
159: robby.run();
160:
161: }
162: }
|