001: /*
002: * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
003: *
004: * Copyright (c) 2001 Brian Pitcher
005: *
006: * Permission is hereby granted, free of charge, to any person obtaining a
007: * copy of this software and associated documentation files (the "Software"),
008: * to deal in the Software without restriction, including without limitation
009: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
010: * and/or sell copies of the Software, and to permit persons to whom the
011: * Software is furnished to do so, subject to the following conditions:
012: *
013: * The above copyright notice and this permission notice shall be included in
014: * all copies or substantial portions of the Software.
015: *
016: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
017: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
018: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
019: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
020: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
021: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
022: * SOFTWARE.
023: */
024:
025: // $Header: /cvsroot/weblech/weblech/src/weblech/spider/Spider.java,v 1.8 2002/06/09 11:34:38 weblech Exp $
026: package weblech.spider;
027:
028: import weblech.util.Logger;
029: import weblech.util.Log4j;
030:
031: import java.util.*;
032: import java.io.*;
033: import java.net.URL;
034:
035: import org.apache.log4j.Category;
036:
037: public class Spider extends Logger implements Runnable, Constants {
038: /** Config for the spider */
039: private SpiderConfig config;
040: /**
041: * Download queue.
042: * Thread safety: To access the queue, first synchronize on it.
043: */
044: private DownloadQueue queue;
045: /**
046: * Set of URLs downloaded or scheduled, so we don't download a
047: * URL more than once.
048: * Thread safety: To access the set, first synchronize on it.
049: */
050: private Set urlsDownloadedOrScheduled;
051: /**
052: * Set of URLs currently being downloaded by Spider threads.
053: * Thread safety: To access the set, first synchronize on it.
054: */
055: private Set urlsDownloading;
056: /**
057: * Number of downloads currently taking place.
058: * Thread safety: To modify this value, first synchronize on
059: * the download queue.
060: */
061: private int downloadsInProgress;
062: /** Whether the spider should quit */
063: private boolean quit;
064: /** Count of running Spider threads. */
065: private int running;
066: /** Time we last checkpointed. */
067: private long lastCheckpoint;
068:
069: public Spider(SpiderConfig config) {
070: this .config = config;
071: queue = new DownloadQueue(config);
072: queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
073: urlsDownloadedOrScheduled = new HashSet();
074: urlsDownloading = new HashSet();
075: downloadsInProgress = 0;
076: lastCheckpoint = 0;
077: }
078:
079: public void start() {
080: quit = false;
081: running = 0;
082:
083: for (int i = 0; i < config.getSpiderThreads(); i++) {
084: _logClass.info("Starting Spider thread");
085: Thread t = new Thread(this , "Spider-Thread-" + (i + 1));
086: t.start();
087: running++;
088: }
089: }
090:
091: public void stop() {
092: quit = true;
093: }
094:
095: public boolean isRunning() {
096: return running == 0;
097: }
098:
099: private void checkpointIfNeeded() {
100: if (config.getCheckpointInterval() == 0) {
101: return;
102: }
103:
104: if (System.currentTimeMillis() - lastCheckpoint > config
105: .getCheckpointInterval()) {
106: synchronized (queue) {
107: if (System.currentTimeMillis() - lastCheckpoint > config
108: .getCheckpointInterval()) {
109: writeCheckpoint();
110: lastCheckpoint = System.currentTimeMillis();
111: }
112: }
113: }
114: }
115:
116: private void writeCheckpoint() {
117: _logClass.debug("writeCheckpoint()");
118: try {
119: FileOutputStream fos = new FileOutputStream(
120: "spider.checkpoint", false);
121: ObjectOutputStream oos = new ObjectOutputStream(fos);
122: oos.writeObject(queue);
123: oos.writeObject(urlsDownloading);
124: oos.close();
125: } catch (IOException ioe) {
126: _logClass.warn("IO Exception attempting checkpoint: "
127: + ioe.getMessage(), ioe);
128: }
129: }
130:
131: public void readCheckpoint() {
132: try {
133: FileInputStream fis = new FileInputStream(
134: "spider.checkpoint");
135: ObjectInputStream ois = new ObjectInputStream(fis);
136: queue = (DownloadQueue) ois.readObject();
137: urlsDownloading = (Set) ois.readObject();
138: queue.queueURLs(urlsDownloading);
139: urlsDownloading.clear();
140: } catch (Exception e) {
141: _logClass.error("Caught exception reading checkpoint: "
142: + e.getMessage(), e);
143: }
144: }
145:
146: public void run() {
147: HTMLParser htmlParser = new HTMLParser(config);
148: URLGetter urlGetter = new URLGetter(config);
149:
150: while ((queueSize() > 0 || downloadsInProgress > 0)
151: && quit == false) {
152: checkpointIfNeeded();
153: if (queueSize() == 0 && downloadsInProgress > 0) {
154: // Wait for a download to finish before seeing if this thread should stop
155: try {
156: Thread.sleep(QUEUE_CHECK_INTERVAL);
157: } catch (InterruptedException ignored) {
158: }
159: // Have another go at the loop
160: continue;
161: } else if (queueSize() == 0) {
162: break;
163: }
164: URLToDownload nextURL;
165: synchronized (queue) {
166: nextURL = queue.getNextInQueue();
167: downloadsInProgress++;
168: }
169: synchronized (urlsDownloading) {
170: urlsDownloading.add(nextURL);
171: }
172: int newDepth = nextURL.getDepth() + 1;
173: int maxDepth = config.getMaxDepth();
174: synchronized (urlsDownloading) {
175: urlsDownloading.remove(nextURL);
176: }
177: List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
178:
179: newURLs = filterURLs(newURLs);
180:
181: ArrayList u2dsToQueue = new ArrayList();
182: for (Iterator i = newURLs.iterator(); i.hasNext();) {
183: URL u = (URL) i.next();
184: // Download if not yet downloaded, and the new depth is less than the maximum
185: synchronized (urlsDownloadedOrScheduled) {
186: if (!urlsDownloadedOrScheduled.contains(u)
187: && (maxDepth == 0 || newDepth <= maxDepth)) {
188: u2dsToQueue.add(new URLToDownload(u, nextURL
189: .getURL(), newDepth));
190: urlsDownloadedOrScheduled.add(u);
191: }
192: }
193: }
194: synchronized (queue) {
195: queue.queueURLs(u2dsToQueue);
196: downloadsInProgress--;
197: }
198: }
199: _logClass.info("Spider thread stopping");
200: running--;
201: }
202:
203: /**
204: * Get the size of the download queue in a thread-safe manner.
205: */
206: private int queueSize() {
207: synchronized (queue) {
208: return queue.size();
209: }
210: }
211:
212: /**
213: * Get a URL, and return new URLs that are referenced from it.
214: *
215: * @return A List of URL objects.
216: */
217: private List downloadURL(URLToDownload url, URLGetter urlGetter,
218: HTMLParser htmlParser) {
219: _logClass.debug("downloadURL(" + url + ")");
220:
221: // Bail out early if image and already on disk
222: URLObject obj = new URLObject(url.getURL(), config);
223: if (obj.existsOnDisk()) {
224: if (config.refreshHTMLs() && (obj.isHTML() || obj.isXML())) {
225: _logClass.info("Q: [" + queue + "] " + url);
226: obj = urlGetter.getURL(url);
227: } else if (config.refreshImages() && obj.isImage()) {
228: _logClass.info("Q: [" + queue + "] " + url);
229: obj = urlGetter.getURL(url);
230: }
231: } else {
232: _logClass.info("Q: [" + queue + "] " + url);
233: obj = urlGetter.getURL(url);
234: }
235:
236: if (obj == null) {
237: return new ArrayList();
238: }
239:
240: if (!obj.existsOnDisk()) {
241: obj.writeToFile();
242: }
243:
244: if (obj.isHTML() || obj.isXML()) {
245: return htmlParser.parseLinksInDocument(url.getURL(), obj
246: .getStringContent());
247: } else if (obj.isImage()) {
248: return new ArrayList();
249: } else {
250: _logClass.warn("Unsupported content type received: "
251: + obj.getContentType());
252: _logClass.info("URL was " + url);
253: return new ArrayList();
254: }
255: }
256:
257: private List filterURLs(List URLs) {
258: String match = config.getURLMatch();
259: ArrayList retVal = new ArrayList();
260:
261: synchronized (urlsDownloadedOrScheduled) {
262: for (Iterator i = URLs.iterator(); i.hasNext();) {
263: URL u = (URL) i.next();
264: if (urlsDownloadedOrScheduled.contains(u)) {
265: continue;
266: }
267:
268: String s = u.toExternalForm();
269: if (s.indexOf(match) != -1) {
270: retVal.add(u);
271: }
272: }
273: }
274: return retVal;
275: }
276:
277: }
|