// plasmaCrawlQueues.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 29.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma.crawler;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import de.anomic.data.robotsParser;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

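// plasmaCrawlQueues bundles the crawler-side queues of the switchboard: the
// notice-URL stacks (core, limit, overhang, remote), the error and delegated
// URL databases, and the pool of crawlWorker threads that load documents via
// the protocol loader.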
public class plasmaCrawlQueues {

    private plasmaSwitchboard sb;
    private serverLog log;
    private Map<Integer, crawlWorker> workers; // mapping from entry hash code to worker thread object
    private plasmaProtocolLoader loader;
    private ArrayList<String> remoteCrawlProviderHashes;

    public plasmaCrawlNURL noticeURL;
    public plasmaCrawlZURL errorURL, delegatedURL;

    public plasmaCrawlQueues(plasmaSwitchboard sb, File plasmaPath) {
        this.sb = sb;
        this.log = new serverLog("CRAWLER");
        this.workers = Collections.synchronizedMap(new HashMap<Integer, crawlWorker>());
        this.loader = new plasmaProtocolLoader(sb, log);
        this.remoteCrawlProviderHashes = new ArrayList<String>();

        // start crawling management
        log.logConfig("Starting Crawling Management");
        noticeURL = new plasmaCrawlNURL(plasmaPath);
        //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be held in RAM and reduces IO
        errorURL = new plasmaCrawlZURL(plasmaPath, "urlError2.db", false);
        delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated2.db", true);
    }

    public String urlExists(String hash) {
        // tests if the hash occurs in any database;
        // if it exists, the name of the database is returned,
        // if it does not exist, null is returned
        if (noticeURL.existsInStack(hash)) return "crawler";
        if (delegatedURL.exists(hash)) return "delegated";
        if (errorURL.exists(hash)) return "errors";
        Iterator<crawlWorker> i = workers.values().iterator();
        while (i.hasNext()) {
            if (i.next().entry.url().hash().equals(hash)) return "worker";
        }
        return null;
    }

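    // remove a URL hash from all crawl queues: the notice stacks as well as the
    // delegated and error databases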
    public void urlRemove(String hash) {
        noticeURL.removeByURLHash(hash);
        delegatedURL.remove(hash);
        errorURL.remove(hash);
    }

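    // resolve a URL hash to a yacyURL by looking it up in the notice, delegated
    // and error databases and finally in the entries of the running workers;
    // returns null for the dummy hash or if the hash is unknown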
    public yacyURL getURL(String urlhash) {
        if (urlhash.equals(yacyURL.dummyHash)) return null;
        plasmaCrawlEntry ne = noticeURL.get(urlhash);
        if (ne != null) return ne.url();
        plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash);
        if (ee != null) return ee.url();
        ee = errorURL.getEntry(urlhash);
        if (ee != null) return ee.url();
        Iterator<crawlWorker> i = workers.values().iterator();
        crawlWorker w;
        while (i.hasNext()) {
            w = i.next();
            if (w.entry.url().hash().equals(urlhash)) return w.entry.url();
        }
        return null;
    }

    public void close() {
        // interrupt all worker threads
        Iterator<crawlWorker> i = workers.values().iterator();
        while (i.hasNext()) i.next().interrupt();
        // TODO: wait some more time until all threads are finished
        noticeURL.close();
        errorURL.close();
        delegatedURL.close();
    }

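    // return a snapshot of the crawl entries currently being processed by the
    // active worker threads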
    public plasmaCrawlEntry[] activeWorker() {
        synchronized (workers) {
            plasmaCrawlEntry[] w = new plasmaCrawlEntry[workers.size()];
            int i = 0;
            Iterator<crawlWorker> j = workers.values().iterator();
            while (j.hasNext()) {
                w[i++] = j.next().entry;
            }
            return w;
        }
    }

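    // delegate to the protocol loader: true if URLs with the given protocol can be crawled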
    public boolean isSupportedProtocol(String protocol) {
        return loader.isSupportedProtocol(protocol);
    }

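    // number of entries in the local (core) crawl stack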
    public int coreCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
    }

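    // process one entry from the local crawl stack: if the local stack runs low
    // (or this is a private robinson peer), entries are first shifted over from
    // the global (limit) stack; then, unless the indexer or the loader is
    // overloaded or crawling is paused, one URL is popped and handed to a
    // crawlWorker thread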
    public boolean coreCrawlJob() {

        boolean robinsonPrivateCase = sb.isRobinsonMode()
                && !sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER)
                && !sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER);

        if ((robinsonPrivateCase || coreCrawlJobSize() <= 20) && limitCrawlJobSize() > 0) {
            // move some tasks to the core crawl job so we have something to do
            int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer forces a wait if it cannot balance
            for (int i = 0; i < toshift; i++) {
                noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
            }
            log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl"
                    + " (coreCrawlJobSize()=" + coreCrawlJobSize()
                    + ", limitCrawlJobSize()=" + limitCrawlJobSize()
                    + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "")
                    + ", robinsonMode=" + (sb.isRobinsonMode() ? "on" : "off"));
        }

        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
            //log.logDebug("CoreCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("CoreCrawl: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("CoreCrawl: online caution, omitting processing");
            return false;
        }
        // if the server is busy, we do crawling more slowly
        //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}

        // if crawling was paused we have to wait until we were notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // do a local crawl
        plasmaCrawlEntry urlEntry = null;
        while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
            String stats = "LOCALCRAWL["
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
            try {
                urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE, true);
                String profileHandle = urlEntry.profileHandle();
                // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
                if (profileHandle == null) {
                    log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }
                plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);
                if (profile == null) {
                    log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }

                // check if the protocol is supported
                yacyURL url = urlEntry.url();
                String urlProtocol = url.getProtocol();
                if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                    this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                    return true;
                }

                log.logFine("LOCALCRAWL: URL=" + urlEntry.url()
                        + ", initiator=" + urlEntry.initiator()
                        + ", crawlOrder=" + (profile.remoteIndexing() ? "true" : "false")
                        + ", depth=" + urlEntry.depth()
                        + ", crawlDepth=" + profile.generalDepth()
                        + ", filter=" + profile.generalFilter()
                        + ", permission=" + ((yacyCore.seedDB == null) ? "undefined"
                                : ((yacyCore.seedDB.mySeed().isSenior() || yacyCore.seedDB.mySeed().isPrincipal()) ? "true" : "false")));

                processLocalCrawling(urlEntry, stats);
                return true;
            } catch (IOException e) {
                log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
                if (e.getMessage().indexOf("hash is null") >= 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
            }
        }
        return true;
    }

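    // fetch crawl requests from other peers: if this peer accepts remote crawls
    // and is not overloaded, pick a peer from the provider list, query its
    // remote crawl URLs via RSS and stack the acceptable ones onto the local
    // crawl stacker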
    public boolean remoteCrawlLoaderJob() {
        // check if we are allowed to crawl urls provided by other peers
        if (!yacyCore.seedDB.mySeed().getFlagAcceptRemoteCrawl()) {
            //this.log.logInfo("remoteCrawlLoaderJob: not done, we are not allowed to do that");
            return false;
        }

        // check if we are an active (senior or principal) peer
        if (!yacyCore.seedDB.mySeed().isActive()) {
            //this.log.logInfo("remoteCrawlLoaderJob: not done, this should be a senior or principal peer");
            return false;
        }

        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }

        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }

        if (sb.onlineCaution()) {
            log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
            return false;
        }

        // check if we have an entry in the provider list, otherwise fill the list
        yacySeed seed;
        if ((remoteCrawlProviderHashes.size() == 0)
                && (coreCrawlJobSize() == 0)
                && (remoteTriggeredCrawlJobSize() == 0)
                && (sb.queueSize() < 10)) {
            if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
                Iterator<yacySeed> e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
                while (e.hasNext()) {
                    seed = e.next();
                    if (seed != null) {
                        remoteCrawlProviderHashes.add(seed.hash);
                    }
                }
            }
        }
        if (remoteCrawlProviderHashes.size() == 0) return false;

        // take one entry from the provider list and load the entries from the remote peer
        seed = null;
        String hash = null;
        while ((seed == null) && (remoteCrawlProviderHashes.size() > 0)) {
            hash = remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
            if (hash == null) continue;
            seed = yacyCore.seedDB.get(hash);
            if (seed == null) continue;
            // check if the peer is inside our cluster
            if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(seed))) {
                seed = null;
                continue;
            }
        }
        if (seed == null) return false;

        // we know a peer which should provide remote crawl entries. load them now.
        rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, 10);
        if (reader == null) return true;
        // parse the rss
        rssReader.Item item;
        yacyURL url, referrer;
        Date loaddate;
        for (int i = 0; i < reader.items(); i++) {
            item = reader.getItem(i);
            //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());

            // put url on remote crawl stack
            try {
                url = new yacyURL(item.getLink(), null);
            } catch (MalformedURLException e) {
                url = null;
            }
            if (url == null) continue; // skip items whose link cannot be parsed
            try {
                referrer = new yacyURL(item.getReferrer(), null);
            } catch (MalformedURLException e) {
                referrer = null;
            }
            try {
                loaddate = serverDate.parseShortSecond(item.getPubDate());
            } catch (ParseException e) {
                loaddate = new Date();
            }
            if (sb.acceptURL(url)) {
                // stack url
                sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);

                if (reasonString == null) {
                    // done
                    log.logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
                } else if (reasonString.startsWith("double")) {
                    // case where we have already loaded the url
                    log.logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
                } else {
                    log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
                }
            } else {
                log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
            }
        }
        return true;
    }

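    // number of entries in the global (limit) crawl stack; coreCrawlJob() shifts
    // entries from this stack to the local stack when the local stack runs low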
    public int limitCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
    }

    public int remoteTriggeredCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
    }

    public boolean remoteTriggeredCrawlJob() {
        // work off crawl requests that had been placed by other peers on our crawl stack

        // do nothing if either there are private processes to be done
        // or there is no global crawl on the stack
        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
            //log.logDebug("GlobalCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("GlobalCrawl: online caution, omitting processing");
            return false;
        }

        // if crawling was paused we have to wait until we were notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
        String stats = "REMOTETRIGGEREDCRAWL["
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
        try {
            plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE, true);
            String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
            plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);

            if (profile == null) {
                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                return false;
            }

            // check if the protocol is supported
            yacyURL url = urlEntry.url();
            String urlProtocol = url.getProtocol();
            if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                return true;
            }

            log.logFine("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url()
                    + ", initiator=" + urlEntry.initiator()
                    + ", crawlOrder=" + (profile.remoteIndexing() ? "true" : "false")
                    + ", depth=" + urlEntry.depth()
                    + ", crawlDepth=" + profile.generalDepth()
                    + ", filter=" + profile.generalFilter()
                    + ", permission=" + ((yacyCore.seedDB == null) ? "undefined"
                            : ((yacyCore.seedDB.mySeed().isSenior() || yacyCore.seedDB.mySeed().isPrincipal()) ? "true" : "false")));

            processLocalCrawling(urlEntry, stats);
            return true;
        } catch (IOException e) {
            log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
            if (e.getMessage().indexOf("hash is null") >= 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE);
            return true;
        }
    }

    private void processLocalCrawling(plasmaCrawlEntry entry, String stats) {
        // work off one crawl stack entry
        if ((entry == null) || (entry.url() == null)) {
            log.logInfo(stats + ": urlEntry = null");
            return;
        }
        new crawlWorker(entry);

        log.logInfo(stats + ": enqueued for load " + entry.url() + " [" + entry.url().hash() + "]");
        return;
    }

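    // load a single resource from the web outside of the regular crawl queues,
    // using the default text or media snippet profile (e.g. for snippet
    // fetching); the entry is passed directly to the protocol loader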
    public plasmaHTCache.Entry loadResourceFromWeb(yacyURL url, int socketTimeout, boolean keepInMemory, boolean forText) {

        plasmaCrawlEntry centry = new plasmaCrawlEntry(
                yacyCore.seedDB.mySeed().hash, url, null, "", new Date(),
                (forText) ? sb.defaultTextSnippetProfile.handle()
                          : sb.defaultMediaSnippetProfile.handle(), // crawl profile
                0, 0, 0);

        return loader.load(centry,
                (forText) ? plasmaParser.PARSER_MODE_CRAWLER
                          : plasmaParser.PARSER_MODE_IMAGE);
    }

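    // number of currently active crawl worker threads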
    public int size() {
        return workers.size();
    }

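    // a crawlWorker loads one crawl entry in its own thread: it registers itself
    // in the workers map, checks robots.txt for http(s) URLs, hands the entry to
    // the protocol loader and records failures in the error-URL database; when
    // done it removes itself from the workers map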
    protected class crawlWorker extends Thread {

        public plasmaCrawlEntry entry;
        private Integer code;

        public crawlWorker(plasmaCrawlEntry entry) {
            this.entry = entry;
            this.entry.setStatus("worker-initialized");
            this.code = Integer.valueOf(entry.hashCode());
            if (!workers.containsKey(code)) {
                workers.put(code, this);
                this.start();
            }
        }

        public void run() {
            try {
                // checking robots.txt for http(s) resources
                this.entry.setStatus("worker-checkingrobots");
                if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https"))
                        && robotsParser.isDisallowed(entry.url())) {
                    log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
                    plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "denied by robots.txt");
                    eentry.store();
                    errorURL.push(eentry);
                } else {
                    // starting a load from the internet
                    this.entry.setStatus("worker-loading");
                    String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
                    if (result != null) {
                        plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "cannot load: " + result);
                        eentry.store();
                        errorURL.push(eentry);
                    } else {
                        this.entry.setStatus("worker-processed");
                    }
                }
            } catch (Exception e) {
                plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), e.getMessage() + " - in worker");
                eentry.store();
                errorURL.push(eentry);
                e.printStackTrace();
            } finally {
                workers.remove(code);
                this.entry.setStatus("worker-finalized");
            }
        }

    }

}
|