001: package net.matuschek.jobo;
002:
003: /************************************************
004: Copyright (c) 2001/2002 by Daniel Matuschek
005: *************************************************/
006:
007: import java.io.File;
008: import java.io.FileWriter;
009: import java.io.Writer;
010:
011: import net.matuschek.http.DownloadRuleSet;
012: import net.matuschek.http.HttpDocToFile;
013: import net.matuschek.http.HttpToolCallback;
014: import net.matuschek.spider.RegExpURLCheck;
015: import net.matuschek.spider.WebRobot;
016: import net.matuschek.spider.WebRobotCallback;
017: import net.matuschek.spider.docfilter.FilterChain;
018: import net.matuschek.spider.docfilter.LinkLocalizer;
019:
020: import org.apache.log4j.Category;
021:
022: import org.exolab.castor.mapping.Mapping;
023: import org.exolab.castor.xml.Marshaller;
024: import org.exolab.castor.xml.Unmarshaller;
025:
026: import org.xml.sax.InputSource;
027:
028: /**
029: * This is a simple class that contains all needed features for JoBo
030: * (the web robot, the download rules, RegExpUrlCheck ...)
031: *
032: * @author Daniel Matuschek
033: * @version $Revision: 1.21 $
034: */
035: public class JoBoBase {
036:
037: /** Log4J logging */
038: private static Category log = Category.getInstance("");
039:
040: /** The file used for XML->Java mapping */
041: private static String mappingfile = "mapping.xml";
042:
043: /** The jobo configuration in XML */
044: private static String xmlconfig = "jobo.xml";
045:
046: /** Start URL for the robot */
047: // private static String startUrl=null;
048: private String storageDirectory = "/tmp";
049: private WebRobot robot = null;
050: private RegExpURLCheck urlcheck = null;
051: private DownloadRuleSet downloadrules = null;
052: private HttpDocToFile docstore = null;
053:
054: /** Filter to localize included links */
055: private LinkLocalizer linkLocalizer = null;
056:
057: /** FilterChains with all filters */
058: private FilterChain filters = null;
059:
060: /**
061: * @exception ClassNotFoundException if the Robot could not be instantiated
062: * for some reason
063: */
064: public JoBoBase() throws ClassNotFoundException {
065: log = Category.getInstance(this .getClass());
066: docstore = new HttpDocToFile(storageDirectory);
067: initializeFilters();
068: robot = new WebRobot();
069: robot.setFilters(filters);
070: }
071:
072: /**
073: * Set the default filter chain
074: */
075: public void initializeFilters() {
076: filters = new FilterChain();
077: linkLocalizer = new LinkLocalizer();
078: filters.add(linkLocalizer);
079: }
080:
081: /**
082: * write the settings to an XML file
083: */
084: public void saveConfig(String filename) {
085: File f1 = new File(mappingfile);
086:
087: if (f1.exists()) {
088: Mapping mapping = new Mapping();
089: try {
090: mapping.loadMapping(mappingfile);
091: Writer writer = new FileWriter(filename);
092: Marshaller marshaller = new Marshaller(writer);
093: marshaller.setMapping(mapping);
094: marshaller.marshal(this );
095: writer.close();
096:
097: log.info("written to XML");
098: } catch (Exception e) {
099: log.error(e.getMessage());
100: e.printStackTrace();
101: }
102: } else {
103: log.error("mapping and/or configfile not found");
104: }
105: }
106:
107: public void registerHttpToolCallback(HttpToolCallback cb) {
108: robot.setHttpToolCallback(cb);
109: }
110:
111: public void registerWebRobotCallback(WebRobotCallback cb) {
112: robot.setWebRobotCallback(cb);
113: }
114:
115: /**
116: * registers the regexpurlcheck and the download rules with the robot
117: */
118: public void configureRobot() {
119: robot.setURLCheck(urlcheck);
120: robot.setDownloadRuleSet(downloadrules);
121: robot.setDocManager(docstore);
122: robot.setFilters(filters);
123: }
124:
125: /**
126: * Get the value of urlcheck.
127: * @return Value of urlcheck.
128: */
129: public RegExpURLCheck getURLCheck() {
130: return urlcheck;
131: }
132:
133: /**
134: * Set the value of urlcheck.
135: * @param v Value to assign to urlcheck.
136: */
137: public void setURLCheck(RegExpURLCheck urlcheck) {
138: this .urlcheck = urlcheck;
139: }
140:
141: /**
142: * Get the value of robot.
143: * @return Value of robot.
144: */
145: public WebRobot getRobot() {
146: return robot;
147: }
148:
149: /**
150: * Set the value of robot. The new Robot will use the
151: * filter that are defined in JoBoBase, even if he had
152: * its own FilterChain before.
153: *
154: * @param robot WebRobot object to use
155: */
156: public void setRobot(WebRobot robot) {
157: this .robot = robot;
158: robot.setFilters(filters);
159: }
160:
161: /**
162: * Localize links ?
163: *
164: * @param localize if this is true, JoBo will trz to replace
165: * absolute links by relative
166: */
167: public void setLocalizeLinks(boolean localize) {
168: if (localize) {
169: linkLocalizer.enable();
170: } else {
171: linkLocalizer.disable();
172: }
173: }
174:
175: /**
176: * is link localization enabled ?
177: */
178: public boolean getLocalizeLinks() {
179: return linkLocalizer.isEnabled();
180: }
181:
182: /**
183: * Get the value of downloadRules.
184: * @return Value of downloadRules.
185: */
186: public DownloadRuleSet getDownloadRuleSet() {
187: return downloadrules;
188: }
189:
190: /**
191: * Set the value of downloadRules.
192: * @param v Value to assign to downloadRules.
193: */
194: public void setDownloadRuleSet(DownloadRuleSet downloadRuleSet) {
195: this .downloadrules = downloadRuleSet;
196: }
197:
198: /**
199: * Get the value of storageDirectory.
200: * @return Value of storageDirectory.
201: */
202: public String getStorageDirectory() {
203: return storageDirectory;
204: }
205:
206: /**
207: * Set the value of storageDirectory.
208: * @param v Value to assign to storageDirectory.
209: */
210: public void setStorageDirectory(String storageDirectory) {
211: this .storageDirectory = storageDirectory;
212: docstore.setBaseDir(storageDirectory);
213: }
214:
215: /**
216: * Enable/disable storing of dynamic documents (with an "?"
217: * somewhere in the URL
218: *
219: * @param v true: enable storing of <b>all</b> documents,
220: * false: store only documents with an URL without "?"
221: */
222: public void setStoreCGI(boolean storeCGI) {
223: this .docstore.setStoreCGI(storeCGI);
224: }
225:
226: /**
227: * Get the status of storeCGI
228: *
229: * @return the current status of storeCGI
230: * @see #setStoreCGI for more information
231: */
232: public boolean getStoreCGI() {
233: return this .docstore.getStoreCGI();
234: }
235:
236: /**
237: * Unmarshall the object from an XML file (jobo.xml) in the current
238: * directory
239: *
240: * @exception ClassNotFoundException if the Robot could not be instantiated
241: * for some reason
242: */
243: public static JoBoBase createFromXML()
244: throws ClassNotFoundException {
245: return createFromXML(".");
246: }
247:
248: /**
249: * Unmarshall the object from an XML file
250: *
251: * @param configDirectory name of the directory where jobo.xml and
252: * mapping.xml should be read from.
253: * @exception ClassNotFoundException if the Robot could not be instantiated
254: * for some reason
255: */
256: public static JoBoBase createFromXML(String configDirectory)
257: throws ClassNotFoundException {
258: JoBoBase baseobj = null;
259:
260: xmlconfig = "jobo.xml";
261:
262: File f1 = new File(configDirectory + File.separatorChar
263: + mappingfile);
264: File f2 = new File(configDirectory + File.separatorChar
265: + xmlconfig);
266:
267: if (f1.exists() && f2.exists()) {
268: Mapping mapping = new Mapping();
269: try {
270: mapping.loadMapping(f1.getPath());
271: Unmarshaller unmar = new Unmarshaller(mapping);
272: unmar.setDebug(true);
273: baseobj = (JoBoBase) unmar.unmarshal(new InputSource(f2
274: .getPath()));
275:
276: log.info("configured from XML");
277:
278: } catch (Exception e) {
279: log.error(e.getMessage());
280: e.printStackTrace();
281: }
282: } else {
283: log.error("mapping and/or configfile not found");
284: }
285:
286: if (baseobj == null) {
287: baseobj = new JoBoBase();
288: }
289:
290: baseobj.configureRobot();
291:
292: return baseobj;
293: }
294:
295: } // JoBoBase
|