// plasmaCrawlStacker.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// This file was contributed by Martin Thelian
// ([MC] removed all multithreading and thread pools, this is not necessary here; complete renovation 2007)
//
// $LastChangedDate: 2008-01-20 21:42:35 +0000 (So, 20 Jan 2008) $
// $LastChangedRevision: 4349 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software, which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt alongside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;

import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroEcoTable;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDomains;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyURL;

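/**
 * The crawl stacker accepts newly discovered URLs, runs the acceptance
 * checks (protocol, network range, blacklist, crawl profile, double
 * registration) and pushes accepted entries onto the notice-URL queue.
 * Entries waiting to be checked are kept in a kelondro table (RAM, ECO
 * or TREE, see the QUEUE_DB_TYPE_* constants) plus an in-memory list of
 * URL hashes. The thread itself only pre-resolves host names via DNS so
 * that stacking does not block on slow lookups.
 */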
public final class plasmaCrawlStacker extends Thread {

    private static final int EcoFSBufferSize = 20;
    private static final String stackfile = "urlNoticeStacker9.db";

    // keys for different database types
    public static final int QUEUE_DB_TYPE_RAM  = 0;
    public static final int QUEUE_DB_TYPE_TREE = 1;
    public static final int QUEUE_DB_TYPE_ECO  = 2;

    final serverLog log = new serverLog("STACKCRAWL");

    private plasmaSwitchboard sb;
    private final LinkedList<String> urlEntryHashCache;
    private kelondroIndex urlEntryCache;
    private File cacheStacksPath;
    private long preloadTime;
    private int dbtype;
    private boolean prequeue;
    private long dnsHit, dnsMiss;
    private int alternateCount;

    // objects for the prefetch task
    private ArrayList<String> dnsfetchHosts = new ArrayList<String>();

    public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath,
            long preloadTime, int dbtype, boolean prequeue) {
        this.sb = sb;
        this.prequeue = prequeue;
        this.dnsHit = 0;
        this.dnsMiss = 0;
        this.alternateCount = 0;

        // init the message list
        this.urlEntryHashCache = new LinkedList<String>();

        // create a stack for newly entered entries
        this.cacheStacksPath = dbPath;
        this.preloadTime = preloadTime;
        this.dbtype = dbtype;

        openDB();
        try {
            // loop through the list and fill the messageList with URL hashes
            Iterator<kelondroRow.Entry> rows = this.urlEntryCache.rows(true, null);
            kelondroRow.Entry entry;
            while (rows.hasNext()) {
                entry = rows.next();
                if (entry == null) {
                    System.out.println("ERROR! null element found");
                    continue;
                }
                this.urlEntryHashCache.add(entry.getColString(0, null));
            }
        } catch (kelondroException e) {
            // if we have an error, we start with a fresh database
            this.log.logSevere(
                    "Unable to initialize crawl stacker queue, kelondroException:"
                    + e.getMessage() + ". Resetting DB.\n", e);

            // delete the old db and create a new one
            try {
                this.urlEntryCache.close();
            } catch (Exception ex) {
                // ignore errors while closing the broken database
            }
            deleteDB();
            openDB();
        } catch (IOException e) {
            // if we have an error, we start with a fresh database
            this.log.logSevere(
                    "Unable to initialize crawl stacker queue, IOException:"
                    + e.getMessage() + ". Resetting DB.\n", e);

            // delete the old db and create a new one
            try {
                this.urlEntryCache.close();
            } catch (Exception ex) {
                // ignore errors while closing the broken database
            }
            deleteDB();
            openDB();
        }
        this.log.logInfo(size() + " entries in the stackCrawl queue.");
        this.start(); // start the prefetcher thread
        this.log.logInfo("STACKCRAWL thread initialized.");
    }

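    /**
     * DNS prefetcher loop: waits until prefetchHost() has queued a host
     * name, then resolves it so that the result is available from the
     * DNS cache when the URL is actually stacked or fetched. Runs until
     * the thread is interrupted (see terminateDNSPrefetcher()).
     */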
    public void run() {
        String nextHost;
        try {
            while (!Thread.currentThread().isInterrupted()) { // action loop
                // take the next host under the same monitor that
                // prefetchHost() uses for adding; re-check the queue in a
                // loop to guard against spurious wake-ups
                synchronized (this) {
                    while (this.dnsfetchHosts.size() == 0) wait();
                    nextHost = this.dnsfetchHosts.remove(this.dnsfetchHosts.size() - 1);
                }
                try {
                    serverDomains.dnsResolve(nextHost);
                } catch (Exception e) {
                    // resolution failures are not fatal for the prefetcher
                }
            }
        } catch (InterruptedException e) {
            // shutdown requested, terminate the loop
        }
    }

    /**
     * Returns true when the host was already known in the DNS cache.
     * If not, the host is stacked on the fetch stack and false is returned.
     */
    public boolean prefetchHost(String host) {
        try {
            serverDomains.dnsResolveFromCache(host);
            return true;
        } catch (UnknownHostException e) {
            synchronized (this) {
                dnsfetchHosts.add(host);
                notifyAll();
            }
            return false;
        }
    }

    public void terminateDNSPrefetcher() {
        synchronized (this) {
            interrupt();
        }
    }

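    /**
     * Shuts the stacker down: for the RAM queue the remaining entries are
     * processed first (a RAM queue does not survive a restart), then the
     * DNS prefetcher is stopped and the backing table is closed.
     */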
    public void close() {
        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            this.log.logFine("Shutdown. Flushing remaining " + size()
                    + " crawl stacker job entries. Please wait.");
            while (size() > 0) {
                if (!job()) break;
            }
        }
        terminateDNSPrefetcher();

        this.log.logFine("Shutdown. Closing stackCrawl queue.");

        // close the db
        this.urlEntryCache.close();

        // clear the hash list
        this.urlEntryHashCache.clear();
    }

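    /**
     * Processes one entry from the queue: dequeues it and runs the
     * stackCrawl checks. If the URL is rejected, the entry is stored in
     * the error-URL db together with the reject reason.
     *
     * @return true if an entry was processed, false if the queue was
     *         empty or an error occurred
     */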
    public boolean job() {
        plasmaCrawlEntry entry;
        try {
            entry = dequeueEntry();
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        if (entry == null) return false;

        try {
            String rejectReason = sb.crawlStacker.stackCrawl(entry);

            // if the url was rejected we store it into the error URL db
            if (rejectReason != null) {
                plasmaCrawlZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
                        entry, yacyCore.seedDB.mySeed().hash, null, 0, rejectReason);
                ee.store();
                sb.crawlQueues.errorURL.push(ee);
            }
        } catch (Exception e) {
            this.log.logWarning(
                    "Error while processing stackCrawl entry.\n"
                    + "Entry: " + entry.toString() + "\nError: " + e.toString(), e);
            return false;
        }
        return true;
    }

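    /**
     * Stores a new crawl entry in the backing table and queues its URL
     * hash. Hashes of hosts that are already in the DNS cache are put at
     * the front of the queue so that they can be stacked without waiting
     * for a lookup; hosts with pending DNS resolution go to the back (see
     * the alternation heuristic below).
     */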
    public void enqueueEntry(yacyURL nexturl, String referrerhash,
            String initiatorHash, String name, Date loadDate,
            int currentdepth, plasmaCrawlProfile.entry profile) {
        if (profile == null) return;
        plasmaCrawlEntry newEntry = new plasmaCrawlEntry(initiatorHash,
                nexturl, referrerhash, name, loadDate,
                profile.handle(), currentdepth, 0, 0);

        synchronized (this.urlEntryHashCache) {
            kelondroRow.Entry oldValue;
            boolean hostknown = true;
            if (prequeue) hostknown = prefetchHost(nexturl.getHost());
            try {
                oldValue = this.urlEntryCache.put(newEntry.toRow());
            } catch (IOException e) {
                // treat a storage error like a new entry
                oldValue = null;
            }
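            // Alternation heuristic: known hosts (DNS hit) always jump to
            // the front of the queue; unknown hosts normally go to the
            // back, but after roughly 2 * dnsHit/dnsMiss front insertions
            // one unknown host is promoted to the front as well, so that
            // hosts with slow DNS resolution are not starved completely.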
            if (oldValue == null) {
                //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
                if (hostknown) {
                    this.alternateCount++;
                    this.urlEntryHashCache.addFirst(newEntry.url().hash());
                    this.dnsHit++;
                } else {
                    if ((this.dnsMiss > 0)
                            && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
                        this.urlEntryHashCache.addFirst(newEntry.url().hash());
                        this.alternateCount = 0;
                        //System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
                    } else {
                        this.urlEntryHashCache.addLast(newEntry.url().hash());
                    }
                    this.dnsMiss++;
                }
            }
        }
    }


    private void deleteDB() {
        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            // do nothing..
        }
        if (this.dbtype == QUEUE_DB_TYPE_ECO) {
            new File(cacheStacksPath, stackfile).delete();
            //kelondroFlexWidthArray.delete(cacheStacksPath, stackfile);
        }
        if (this.dbtype == QUEUE_DB_TYPE_TREE) {
            File cacheFile = new File(cacheStacksPath, stackfile);
            cacheFile.delete();
        }
    }

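    /**
     * Opens the backing table according to dbtype: a volatile
     * kelondroRowSet for RAM, a kelondroEcoTable file for ECO, or a
     * cached kelondroTree for TREE. A corrupted ECO table file is
     * deleted and recreated once before the process gives up.
     */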
    private void openDB() {
        if (!(cacheStacksPath.exists())) cacheStacksPath.mkdirs(); // make the path

        if (this.dbtype == QUEUE_DB_TYPE_RAM) {
            this.urlEntryCache = new kelondroRowSet(plasmaCrawlEntry.rowdef, 0);
        }
        if (this.dbtype == QUEUE_DB_TYPE_ECO) {
            cacheStacksPath.mkdirs();
            File f = new File(cacheStacksPath, stackfile);
            try {
                this.urlEntryCache = new kelondroEcoTable(f,
                        plasmaCrawlEntry.rowdef,
                        kelondroEcoTable.tailCacheUsageAuto,
                        EcoFSBufferSize, 0);
                //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef, 0, true));
            } catch (Exception e) {
                e.printStackTrace();
                // kill DB and try again
                f.delete();
                //kelondroFlexTable.delete(cacheStacksPath, newCacheName);
                try {
                    this.urlEntryCache = new kelondroEcoTable(f,
                            plasmaCrawlEntry.rowdef,
                            kelondroEcoTable.tailCacheUsageAuto,
                            EcoFSBufferSize, 0);
                    //this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef, 0, true));
                } catch (Exception ee) {
                    ee.printStackTrace();
                    System.exit(-1);
                }
            }
        }
        if (this.dbtype == QUEUE_DB_TYPE_TREE) {
            File cacheFile = new File(cacheStacksPath, stackfile);
            cacheFile.getParentFile().mkdirs();
            this.urlEntryCache = new kelondroCache(kelondroTree.open(
                    cacheFile, true, preloadTime, plasmaCrawlEntry.rowdef));
        }
    }

    public int size() {
        synchronized (this.urlEntryHashCache) {
            return this.urlEntryHashCache.size();
        }
    }

    public int getDBType() {
        return this.dbtype;
    }

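    /**
     * Removes the first URL hash from the queue and resolves it against
     * the backing table.
     *
     * @return the crawl entry, or null if the queue is empty or the
     *         entry has vanished from the table
     */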
    public plasmaCrawlEntry dequeueEntry() throws IOException {
        String urlHash = null;
        kelondroRow.Entry entry = null;
        synchronized (this.urlEntryHashCache) {
            // check the size inside the lock so that a concurrent
            // dequeue cannot empty the queue between test and removal
            if (this.urlEntryHashCache.size() == 0) return null;
            urlHash = this.urlEntryHashCache.removeFirst();
            if (urlHash == null) throw new IOException("urlHash is null");
            entry = this.urlEntryCache.remove(urlHash.getBytes(), false);
        }

        if (entry == null) return null;
        return new plasmaCrawlEntry(entry);
    }

    /**
     * Stacks a crawl item. The position can also be remote.
     *
     * @return null if successful, a reason string if not successful
     */
    public String stackCrawl(yacyURL url, yacyURL referrer,
            String initiatorHash, String name, Date loadDate,
            int currentdepth, plasmaCrawlProfile.entry profile) {
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

        // add the url into the crawling queue
        plasmaCrawlEntry entry = new plasmaCrawlEntry(
                initiatorHash,                                // initiator, needed for p2p-feedback
                url,                                          // url clear text string
                (referrer == null) ? null : referrer.hash(),  // hash of the last url in the crawling queue
                name,                                         // the anchor name
                loadDate,                                     // load date
                (profile == null) ? null : profile.handle(),  // crawl profile handle
                currentdepth,                                 // depth so far
                0,                                            // anchors, default value
                0                                             // forkfactor, default value
        );
        return stackCrawl(entry);
    }

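    /**
     * Stacks a crawl item; the position can also be remote. Runs the
     * acceptance checks in this order: supported protocol, IP inside the
     * declared network domain, blacklist, known crawl profile, general
     * crawl filter, CGI URL, POST URL, granted domain appearance, domain
     * page count and double registration. The first failed check
     * produces the reject reason that is returned.
     *
     * @return null if successful, a reason string if not successful
     */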
    public String stackCrawl(plasmaCrawlEntry entry) {
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

        long startTime = System.currentTimeMillis();
        String reason = null; // failure reason

        // check if the protocol is supported
        String urlProtocol = entry.url().getProtocol();
        if (!sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
            reason = plasmaCrawlEURL.DENIED_UNSUPPORTED_PROTOCOL;
            this.log.logSevere("Unsupported protocol in URL '"
                    + entry.url().toString() + "'. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // check if ip is local ip address
        if (!sb.acceptURL(entry.url())) {
            reason = plasmaCrawlEURL.DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN
                    + "[" + sb.getConfig("network.unit.domain", "unknown") + "]";
            this.log.logFine("Host in URL '" + entry.url().toString()
                    + "' has IP address outside of declared range ("
                    + sb.getConfig("network.unit.domain", "unknown")
                    + "). " + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // check blacklist
        if (plasmaSwitchboard.urlBlacklist.isListed(
                plasmaURLPattern.BLACKLIST_CRAWLER, entry.url())) {
            reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is in blacklist. " + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls
                .getEntry(entry.profileHandle());
        if (profile == null) {
            String errorMsg = "LOST PROFILE HANDLE '"
                    + entry.profileHandle() + "' for URL " + entry.url();
            log.logWarning(errorMsg);
            return errorMsg;
        }

        // filter deny
        if ((entry.depth() > 0)
                && (!(entry.url().toString().matches(profile.generalFilter())))) {
            reason = plasmaCrawlEURL.DENIED_URL_DOES_NOT_MATCH_FILTER;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' does not match crawling filter '"
                    + profile.generalFilter() + "'. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // deny cgi
        if (entry.url().isCGI()) {
            reason = plasmaCrawlEURL.DENIED_CGI_URL;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is CGI URL. " + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // deny post properties
        if ((entry.url().isPOST()) && (!(profile.crawlingQ()))) {
            reason = plasmaCrawlEURL.DENIED_POST_URL;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is post URL. " + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        yacyURL referrerURL = (entry.referrerhash() == null) ? null
                : sb.crawlQueues.getURL(entry.referrerhash());

        // add domain to profile domain list
        if ((profile.domFilterDepth() != Integer.MAX_VALUE)
                || (profile.domMaxPages() != Integer.MAX_VALUE)) {
            profile.domInc(entry.url().getHost(),
                    (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(),
                    entry.depth());
        }

        // deny urls that do not match with the profile domain list
        if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
            reason = plasmaCrawlEURL.DENIED_NO_MATCH_WITH_DOMAIN_FILTER;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is not listed in granted domains. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // deny urls that exceed allowed number of occurrences
        if (!(profile.grantedDomCount(entry.url().getHost()))) {
            reason = plasmaCrawlEURL.DENIED_DOMAIN_COUNT_EXCEEDED;
            this.log.logFine("URL '" + entry.url().toString()
                    + "' appeared too often; a maximum of "
                    + profile.domMaxPages() + " is allowed. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }

        // check if the url is double registered
        String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
        indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(
                entry.url().hash(), null, 0);
        boolean recrawl = (oldEntry != null)
                && ((System.currentTimeMillis() - oldEntry.loaddate().getTime())
                        > profile.recrawlIfOlder());
        // do double-check
        if ((dbocc != null) && (!recrawl)) {
            reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is double registered in '" + dbocc + "'. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }
        if ((oldEntry != null) && (!recrawl)) {
            reason = plasmaCrawlEURL.DOUBLE_REGISTERED + "LURL)";
            this.log.logFine("URL '" + entry.url().toString()
                    + "' is double registered in 'LURL'. "
                    + "Stack processing time: "
                    + (System.currentTimeMillis() - startTime) + "ms");
            return reason;
        }


        // show potential re-crawl
        if (recrawl) {
            this.log.logFine("RE-CRAWL of URL '" + entry.url().toString()
                    + "': this url was crawled "
                    + ((System.currentTimeMillis()
                            - oldEntry.loaddate().getTime()) / 60000 / 60 / 24)
                    + " days ago.");
        }

        // decide where the url is stacked: local or global crawl
        boolean local = ((entry.initiator().equals(yacyURL.dummyHash))
                || (entry.initiator().equals(yacyCore.seedDB.mySeed().hash)));
        boolean global = (profile.remoteIndexing()) /* granted */
                && (entry.depth() == profile.generalDepth()) /* leaf node */
                //&& (initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */
                && ((yacyCore.seedDB.mySeed().isSenior())
                        || (yacyCore.seedDB.mySeed().isPrincipal())) /* qualified */;

        if ((!local) && (!global)
                && (!profile.handle().equals(this.sb.defaultRemoteProfile.handle()))) {
            this.log.logSevere("URL '" + entry.url().toString()
                    + "' can neither be crawled local nor global.");
        }

        // add the url into the crawling queue
        sb.crawlQueues.noticeURL.push(
                ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT
                        : ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE
                                : plasmaCrawlNURL.STACK_TYPE_REMOTE)), // local/global/remote stack
                entry);
        return null;
    }

}
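
/*
 * Usage sketch (hypothetical wiring; `sb`, `url`, `initiatorHash` and
 * `profile` are assumed to exist and be configured elsewhere, and the
 * db path is made up for illustration; this shows the call order, it is
 * not code that runs anywhere in YaCy):
 *
 *   plasmaCrawlStacker stacker = new plasmaCrawlStacker(
 *           sb, new File("DATA/PLASMADB"), 1000,
 *           plasmaCrawlStacker.QUEUE_DB_TYPE_ECO, true);
 *   stacker.enqueueEntry(url, null, initiatorHash, "anchor name",
 *           new Date(), 0, profile);
 *   while (stacker.size() > 0) {
 *       if (!stacker.job()) break; // process queued entries
 *   }
 *   stacker.close();
 */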