0001: // plasmaSwitchboard.java
0002: // (C) 2004-2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
0003: // first published 2004 on http://yacy.net
0004: //
0005: // This is a part of YaCy, a peer-to-peer based web search engine
0006: //
0007: // $LastChangedDate: 2008-01-31 23:40:47 +0000 (Do, 31 Jan 2008) $
0008: // $LastChangedRevision: 4424 $
0009: // $LastChangedBy: orbiter $
0010: //
0011: // LICENSE
0012: //
0013: // This program is free software; you can redistribute it and/or modify
0014: // it under the terms of the GNU General Public License as published by
0015: // the Free Software Foundation; either version 2 of the License, or
0016: // (at your option) any later version.
0017: //
0018: // This program is distributed in the hope that it will be useful,
0019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0021: // GNU General Public License for more details.
0022: //
0023: // You should have received a copy of the GNU General Public License
0024: // along with this program; if not, write to the Free Software
0025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0026:
0027: /*
0028: This class holds the run-time environment of the plasma
0029: Search Engine. It's data forms a blackboard which can be used
0030: to organize running jobs around the indexing algorithm.
0031: The blackboard consist of the following entities:
0032: - storage: one plasmaStore object with the url-based database
0033: - configuration: initialized by properties once, then by external functions
0034: - job queues: for parsing, condensing, indexing
0035: - black/blue/whitelists: controls input and output to the index
0036:
0037: this class is also the core of the http crawling.
0038: There are some items that need to be respected when crawling the web:
0039: 1) respect robots.txt
0040: 2) do not access one domain too frequently, wait between accesses
0041: 3) remember crawled URL's and do not access again too early
0042: 4) priorization of specific links should be possible (hot-lists)
0043: 5) attributes for crawling (depth, filters, hot/black-lists, priority)
0044: 6) different crawling jobs with different attributes ('Orders') simultanoulsy
0045:
0046: We implement some specific tasks and use different database to archieve these goals:
0047: - a database 'crawlerDisallow.db' contains all url's that shall not be crawled
0048: - a database 'crawlerDomain.db' holds all domains and access times, where we loaded the disallow tables
0049: this table contains the following entities:
0050: <flag: robotes exist/not exist, last access of robots.txt, last access of domain (for access scheduling)>
0051: - four databases for scheduled access: crawlerScheduledHotText.db, crawlerScheduledColdText.db,
0052: crawlerScheduledHotMedia.db and crawlerScheduledColdMedia.db
0053: - two stacks for new URLS: newText.stack and newMedia.stack
0054: - two databases for URL double-check: knownText.db and knownMedia.db
0055: - one database with crawling orders: crawlerOrders.db
0056:
0057: The Information flow of a single URL that is crawled is as follows:
0058: - a html file is loaded from a specific URL within the module httpdProxyServlet as
0059: a process of the proxy.
0060: - the file is passed to httpdProxyCache. Here it's processing is delayed until the proxy is idle.
0061: - The cache entry is passed on to the plasmaSwitchboard. There the URL is stored into plasmaLURL where
0062: the URL is stored under a specific hash. The URL's from the content are stripped off, stored in plasmaLURL
0063: with a 'wrong' date (the date of the URL's are not known at this time, only after fetching) and stacked with
0064: plasmaCrawlerTextStack. The content is read and splitted into rated words in plasmaCondenser.
0065: The splitted words are then integrated into the index with plasmaSearch.
0066: - In plasmaSearch the words are indexed by reversing the relation between URL and words: one URL points
0067: to many words, the words within the document at the URL. After reversing, one word points
0068: to many URL's, all the URL's where the word occurrs. One single word->URL-hash relation is stored in
0069: plasmaIndexEntry. A set of plasmaIndexEntries is a reverse word index.
0070: This reverse word index is stored temporarly in plasmaIndexCache.
0071: - In plasmaIndexCache the single plasmaIndexEntry'ies are collected and stored into a plasmaIndex - entry
0072: These plasmaIndex - Objects are the true reverse words indexes.
0073: - in plasmaIndex the plasmaIndexEntry - objects are stored in a kelondroTree; an indexed file in the file system.
0074:
0075: The information flow of a search request is as follows:
0076: - in httpdFileServlet the user enters a search query, which is passed to plasmaSwitchboard
0077: - in plasmaSwitchboard, the query is passed to plasmaSearch.
0078: - in plasmaSearch, the plasmaSearch.result object is generated by simultanous enumeration of
0079: URL hases in the reverse word indexes plasmaIndex
0080: - (future: the plasmaSearch.result - object is used to identify more key words for a new search)
0081:
0082:
0083:
0084: */
0085:
0086: package de.anomic.plasma;
0087:
0088: import java.io.File;
0089: import java.io.FileInputStream;
0090: import java.io.IOException;
0091: import java.io.InputStream;
0092: import java.io.UnsupportedEncodingException;
0093: import java.lang.reflect.Constructor;
0094: import java.net.InetAddress;
0095: import java.net.MalformedURLException;
0096: import java.text.SimpleDateFormat;
0097: import java.util.ArrayList;
0098: import java.util.Date;
0099: import java.util.HashMap;
0100: import java.util.Hashtable;
0101: import java.util.Iterator;
0102: import java.util.Locale;
0103: import java.util.Map;
0104: import java.util.Properties;
0105: import java.util.Set;
0106: import java.util.Timer;
0107: import java.util.TimerTask;
0108: import java.util.TreeMap;
0109: import java.util.TreeSet;
0110:
0111: import de.anomic.data.URLLicense;
0112: import de.anomic.data.blogBoard;
0113: import de.anomic.data.blogBoardComments;
0114: import de.anomic.data.bookmarksDB;
0115: import de.anomic.data.listManager;
0116: import de.anomic.data.messageBoard;
0117: import de.anomic.data.userDB;
0118: import de.anomic.data.wikiBoard;
0119: import de.anomic.data.wiki.wikiParser;
0120: import de.anomic.htmlFilter.htmlFilterContentScraper;
0121: import de.anomic.http.httpHeader;
0122: import de.anomic.http.httpRemoteProxyConfig;
0123: import de.anomic.http.httpc;
0124: import de.anomic.http.httpd;
0125: import de.anomic.http.httpdRobotsTxtConfig;
0126: import de.anomic.index.indexContainer;
0127: import de.anomic.index.indexRWIEntry;
0128: import de.anomic.index.indexRWIRowEntry;
0129: import de.anomic.index.indexURLEntry;
0130: import de.anomic.kelondro.kelondroBitfield;
0131: import de.anomic.kelondro.kelondroCache;
0132: import de.anomic.kelondro.kelondroCachedRecords;
0133: import de.anomic.kelondro.kelondroException;
0134: import de.anomic.kelondro.kelondroMSetTools;
0135: import de.anomic.kelondro.kelondroMapTable;
0136: import de.anomic.kelondro.kelondroNaturalOrder;
0137: import de.anomic.plasma.crawler.plasmaCrawlQueues;
0138: import de.anomic.plasma.crawler.plasmaProtocolLoader;
0139: import de.anomic.plasma.dbImport.dbImportManager;
0140: import de.anomic.plasma.parser.ParserException;
0141: import de.anomic.plasma.plasmaCondenser.wordStatProp;
0142: import de.anomic.plasma.urlPattern.defaultURLPattern;
0143: import de.anomic.plasma.urlPattern.plasmaURLPattern;
0144: import de.anomic.server.serverAbstractSwitch;
0145: import de.anomic.server.serverDomains;
0146: import de.anomic.server.serverFileUtils;
0147: import de.anomic.server.serverInstantThread;
0148: import de.anomic.server.serverMemory;
0149: import de.anomic.server.serverObjects;
0150: import de.anomic.server.serverProfiling;
0151: import de.anomic.server.serverSemaphore;
0152: import de.anomic.server.serverSwitch;
0153: import de.anomic.server.serverThread;
0154: import de.anomic.server.logging.serverLog;
0155: import de.anomic.tools.crypt;
0156: import de.anomic.yacy.yacyClient;
0157: import de.anomic.yacy.yacyCore;
0158: import de.anomic.yacy.yacyNewsPool;
0159: import de.anomic.yacy.yacyNewsRecord;
0160: import de.anomic.yacy.yacySeed;
0161: import de.anomic.yacy.yacyURL;
0162: import de.anomic.yacy.yacyVersion;
0163:
0164: public final class plasmaSwitchboard extends serverAbstractSwitch
0165: implements serverSwitch {
0166:
0167: // load slots
0168: public static int xstackCrawlSlots = 2000;
0169:
0170: private int dhtTransferIndexCount = 100;
0171:
0172: // we must distinguish the following cases: resource-load was initiated by
0173: // 1) global crawling: the index is extern, not here (not possible here)
0174: // 2) result of search queries, some indexes are here (not possible here)
0175: // 3) result of index transfer, some of them are here (not possible here)
0176: // 4) proxy-load (initiator is "------------")
0177: // 5) local prefetch/crawling (initiator is own seedHash)
0178: // 6) local fetching for global crawling (other known or unknown initiator)
0179: public static final int PROCESSCASE_0_UNKNOWN = 0;
0180: public static final int PROCESSCASE_1_GLOBAL_CRAWLING = 1;
0181: public static final int PROCESSCASE_2_SEARCH_QUERY_RESULT = 2;
0182: public static final int PROCESSCASE_3_INDEX_TRANSFER_RESULT = 3;
0183: public static final int PROCESSCASE_4_PROXY_LOAD = 4;
0184: public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
0185: public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
0186:
0187: // couloured list management
0188: public static TreeSet<String> badwords = null;
0189: public static TreeSet<String> blueList = null;
0190: public static TreeSet<String> stopwords = null;
0191: public static plasmaURLPattern urlBlacklist;
0192:
0193: public static wikiParser wikiParser = null;
0194:
0195: // storage management
0196: public File htCachePath;
0197: private File plasmaPath;
0198: public File indexPrimaryPath, indexSecondaryPath;
0199: public File listsPath;
0200: public File htDocsPath;
0201: public File rankingPath;
0202: public File workPath;
0203: public File releasePath;
0204: public HashMap<String, String> rankingPermissions;
0205: public plasmaWordIndex wordIndex;
0206: public plasmaCrawlQueues crawlQueues;
0207: public plasmaSwitchboardQueue sbQueue;
0208: public plasmaCrawlStacker crawlStacker;
0209: public messageBoard messageDB;
0210: public wikiBoard wikiDB;
0211: public blogBoard blogDB;
0212: public blogBoardComments blogCommentDB;
0213: public static plasmaCrawlRobotsTxt robots;
0214: public plasmaCrawlProfile profilesActiveCrawls,
0215: profilesPassiveCrawls;
0216: public plasmaCrawlProfile.entry defaultProxyProfile;
0217: public plasmaCrawlProfile.entry defaultRemoteProfile;
0218: public plasmaCrawlProfile.entry defaultTextSnippetProfile;
0219: public plasmaCrawlProfile.entry defaultMediaSnippetProfile;
0220: public boolean rankingOn;
0221: public plasmaRankingDistribution rankingOwnDistribution;
0222: public plasmaRankingDistribution rankingOtherDistribution;
0223: public HashMap<String, Object[]> outgoingCookies, incomingCookies;
0224: public kelondroMapTable facilityDB;
0225: public plasmaParser parser;
0226: public long proxyLastAccess, localSearchLastAccess,
0227: remoteSearchLastAccess;
0228: public yacyCore yc;
0229: public HashMap<String, plasmaSwitchboardQueue.Entry> indexingTasksInProcess;
0230: public userDB userDB;
0231: public bookmarksDB bookmarksDB;
0232: public plasmaWebStructure webStructure;
0233: public dbImportManager dbImportManager;
0234: public plasmaDHTFlush transferIdxThread = null;
0235: private plasmaDHTChunk dhtTransferChunk = null;
0236: public ArrayList<HashMap<String, Object>> localSearches,
0237: remoteSearches; // array of search result properties as HashMaps
0238: public HashMap<String, TreeSet<Long>> localSearchTracker,
0239: remoteSearchTracker; // mappings from requesting host to a TreeSet of Long(access time)
0240: public long lastseedcheckuptime = -1;
0241: public long indexedPages = 0;
0242: public long lastindexedPages = 0;
0243: public double requestedQueries = 0d;
0244: public double lastrequestedQueries = 0d;
0245: public int totalPPM = 0;
0246: public double totalQPM = 0d;
0247: public TreeMap<String, String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
0248: public boolean acceptLocalURLs, acceptGlobalURLs;
0249: public URLLicense licensedURLs;
0250: public Timer moreMemory;
0251:
0252: /*
0253: * Remote Proxy configuration
0254: */
0255: // public boolean remoteProxyUse;
0256: // public boolean remoteProxyUse4Yacy;
0257: // public String remoteProxyHost;
0258: // public int remoteProxyPort;
0259: // public String remoteProxyNoProxy = "";
0260: // public String[] remoteProxyNoProxyPatterns = null;
0261: public httpRemoteProxyConfig remoteProxyConfig = null;
0262:
0263: public httpdRobotsTxtConfig robotstxtConfig = null;
0264:
0265: /*
0266: * Some constants
0267: */
0268: public static final String STR_REMOTECRAWLTRIGGER = "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER ";
0269:
0270: private serverSemaphore shutdownSync = new serverSemaphore(0);
0271: private boolean terminate = false;
0272:
0273: //private Object crawlingPausedSync = new Object();
0274: //private boolean crawlingIsPaused = false;
0275:
0276: public static final int CRAWLJOB_SYNC = 0;
0277: public static final int CRAWLJOB_STATUS = 1;
0278:
0279: //////////////////////////////////////////////////////////////////////////////////////////////
0280: // Thread settings
0281: //////////////////////////////////////////////////////////////////////////////////////////////
0282:
0283: // 20_dhtdistribution
0284: /**
0285: * <p><code>public static final String <strong>INDEX_DIST</strong> = "20_dhtdistribution"</code></p>
0286: * <p>Name of the DHT distribution thread, which selects index chunks and transfers them to other peers
0287: * according to the global DHT rules</p>
0288: */
0289: public static final String INDEX_DIST = "20_dhtdistribution";
0290: public static final String INDEX_DIST_METHOD_START = "dhtTransferJob";
0291: public static final String INDEX_DIST_METHOD_JOBCOUNT = null;
0292: public static final String INDEX_DIST_METHOD_FREEMEM = null;
0293: public static final String INDEX_DIST_MEMPREREQ = "20_dhtdistribution_memprereq";
0294: public static final String INDEX_DIST_IDLESLEEP = "20_dhtdistribution_idlesleep";
0295: public static final String INDEX_DIST_BUSYSLEEP = "20_dhtdistribution_busysleep";
0296:
0297: // 30_peerping
0298: /**
0299: * <p><code>public static final String <strong>PEER_PING</strong> = "30_peerping"</code></p>
0300: * <p>Name of the Peer Ping thread which publishes the own peer and retrieves information about other peers
0301: * connected to the YaCy-network</p>
0302: */
0303: public static final String PEER_PING = "30_peerping";
0304: public static final String PEER_PING_METHOD_START = "peerPing";
0305: public static final String PEER_PING_METHOD_JOBCOUNT = null;
0306: public static final String PEER_PING_METHOD_FREEMEM = null;
0307: public static final String PEER_PING_IDLESLEEP = "30_peerping_idlesleep";
0308: public static final String PEER_PING_BUSYSLEEP = "30_peerping_busysleep";
0309:
0310: // 40_peerseedcycle
0311: /**
0312: * <p><code>public static final String <strong>SEED_UPLOAD</strong> = "40_peerseedcycle"</code></p>
0313: * <p>Name of the seed upload thread, providing the so-called seed-lists needed during bootstrapping</p>
0314: */
0315: public static final String SEED_UPLOAD = "40_peerseedcycle";
0316: public static final String SEED_UPLOAD_METHOD_START = "publishSeedList";
0317: public static final String SEED_UPLOAD_METHOD_JOBCOUNT = null;
0318: public static final String SEED_UPLOAD_METHOD_FREEMEM = null;
0319: public static final String SEED_UPLOAD_IDLESLEEP = "40_peerseedcycle_idlesleep";
0320: public static final String SEED_UPLOAD_BUSYSLEEP = "40_peerseedcycle_busysleep";
0321:
0322: // 50_localcrawl
0323: /**
0324: * <p><code>public static final String <strong>CRAWLJOB_LOCAL_CRAWL</strong> = "50_localcrawl"</code></p>
0325: * <p>Name of the local crawler thread, popping one entry off the Local Crawl Queue, and passing it to the
0326: * proxy cache enqueue thread to download and further process it</p>
0327: *
0328: * @see plasmaSwitchboard#PROXY_CACHE_ENQUEUE
0329: */
0330: public static final String CRAWLJOB_LOCAL_CRAWL = "50_localcrawl";
0331: public static final String CRAWLJOB_LOCAL_CRAWL_METHOD_START = "coreCrawlJob";
0332: public static final String CRAWLJOB_LOCAL_CRAWL_METHOD_JOBCOUNT = "coreCrawlJobSize";
0333: public static final String CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM = null;
0334: public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep";
0335: public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep";
0336:
0337: // 61_globalcawltrigger
0338: /**
0339: * <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
0340: * <p>Name of the remote crawl list loading thread</p>
0341: *
0342: * @see plasmaSwitchboard#CRAWLJOB_REMOTE_CRAWL_LOADER
0343: */
0344: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
0345: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
0346: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
0347: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM = null;
0348: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP = "60_remotecrawlloader_idlesleep";
0349: public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP = "60_remotecrawlloader_busysleep";
0350:
0351: // 62_remotetriggeredcrawl
0352: /**
0353: * <p><code>public static final String <strong>CRAWLJOB_REMOTE_TRIGGERED_CRAWL</strong> = "62_remotetriggeredcrawl"</code></p>
0354: * <p>Name of the remote triggered crawl thread, responsible for processing a remote crawl received from another peer</p>
0355: */
0356: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL = "62_remotetriggeredcrawl";
0357: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START = "remoteTriggeredCrawlJob";
0358: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT = "remoteTriggeredCrawlJobSize";
0359: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM = null;
0360: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP = "62_remotetriggeredcrawl_idlesleep";
0361: public static final String CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP = "62_remotetriggeredcrawl_busysleep";
0362:
0363: // 70_cachemanager
0364: /**
0365: * <p><code>public static final String <strong>PROXY_CACHE_ENQUEUE</strong> = "70_cachemanager"</code></p>
0366: * <p>Name of the proxy cache enqueue thread which fetches a given website and saves the site itself as well as it's
0367: * HTTP-headers in the HTCACHE</p>
0368: *
0369: * @see plasmaSwitchboard#PROXY_CACHE_PATH
0370: */
0371: public static final String PROXY_CACHE_ENQUEUE = "70_cachemanager";
0372: public static final String PROXY_CACHE_ENQUEUE_METHOD_START = "htEntryStoreJob";
0373: public static final String PROXY_CACHE_ENQUEUE_METHOD_JOBCOUNT = "htEntrySize";
0374: public static final String PROXY_CACHE_ENQUEUE_METHOD_FREEMEM = null;
0375: public static final String PROXY_CACHE_ENQUEUE_IDLESLEEP = "70_cachemanager_idlesleep";
0376: public static final String PROXY_CACHE_ENQUEUE_BUSYSLEEP = "70_cachemanager_busysleep";
0377:
0378: // 80_indexing
0379: /**
0380: * <p><code>public static final String <strong>INDEXER</strong> = "80_indexing"</code></p>
0381: * <p>Name of the indexer thread, performing the actual indexing of a website</p>
0382: */
0383: public static final String INDEXER = "80_indexing";
0384: public static final String INDEXER_CLUSTER = "80_indexing_cluster";
0385: public static final String INDEXER_MEMPREREQ = "80_indexing_memprereq";
0386: public static final String INDEXER_IDLESLEEP = "80_indexing_idlesleep";
0387: public static final String INDEXER_BUSYSLEEP = "80_indexing_busysleep";
0388: public static final String INDEXER_METHOD_START = "deQueue";
0389: public static final String INDEXER_METHOD_JOBCOUNT = "queueSize";
0390: public static final String INDEXER_METHOD_FREEMEM = "deQueueFreeMem";
0391: public static final String INDEXER_SLOTS = "indexer.slots";
0392:
0393: // 82_crawlstack
0394: /**
0395: * <p><code>public static final String <strong>CRAWLSTACK</strong> = "82_crawlstack"</code></p>
0396: * <p>Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check</p>
0397: */
0398: public static final String CRAWLSTACK = "82_crawlstack";
0399: public static final String CRAWLSTACK_METHOD_START = "job";
0400: public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
0401: public static final String CRAWLSTACK_METHOD_FREEMEM = null;
0402: public static final String CRAWLSTACK_IDLESLEEP = "82_crawlstack_idlesleep";
0403: public static final String CRAWLSTACK_BUSYSLEEP = "82_crawlstack_busysleep";
0404: public static final String CRAWLSTACK_SLOTS = "stacker.slots";
0405:
0406: // 90_cleanup
0407: /**
0408: * <p><code>public static final String <strong>CLEANUP</strong> = "90_cleanup"</code></p>
0409: * <p>The cleanup thread which is responsible for pendant cleanup-jobs, news/ranking distribution, etc.</p>
0410: */
0411: public static final String CLEANUP = "90_cleanup";
0412: public static final String CLEANUP_METHOD_START = "cleanupJob";
0413: public static final String CLEANUP_METHOD_JOBCOUNT = "cleanupJobSize";
0414: public static final String CLEANUP_METHOD_FREEMEM = null;
0415: public static final String CLEANUP_IDLESLEEP = "90_cleanup_idlesleep";
0416: public static final String CLEANUP_BUSYSLEEP = "90_cleanup_busysleep";
0417:
0418: //////////////////////////////////////////////////////////////////////////////////////////////
0419: // RAM Cache settings
0420: //////////////////////////////////////////////////////////////////////////////////////////////
0421:
0422: /**
0423: * <p><code>public static final String <strong>RAM_CACHE_LURL</strong> = "ramCacheLURL"</code></p>
0424: * <p>Name of the setting how much memory in bytes should be assigned to the Loaded URLs DB for caching purposes</p>
0425: */
0426: public static final String RAM_CACHE_LURL_TIME = "ramCacheLURL_time";
0427: /**
0428: * <p><code>public static final String <strong>RAM_CACHE_NURL</strong> = "ramCacheNURL"</code></p>
0429: * <p>Name of the setting how much memory in bytes should be assigned to the Noticed URLs DB for caching purposes</p>
0430: */
0431: public static final String RAM_CACHE_NURL_TIME = "ramCacheNURL_time";
0432: /**
0433: * <p><code>public static final String <strong>RAM_CACHE_EURL</strong> = "ramCacheEURL"</code></p>
0434: * <p>Name of the setting how much memory in bytes should be assigned to the Erroneous URLs DB for caching purposes</p>
0435: */
0436: public static final String RAM_CACHE_EURL_TIME = "ramCacheEURL_time";
0437: /**
0438: * <p><code>public static final String <strong>RAM_CACHE_RWI</strong> = "ramCacheRWI"</code></p>
0439: * <p>Name of the setting how much memory in bytes should be assigned to the RWIs DB for caching purposes</p>
0440: */
0441: public static final String RAM_CACHE_RWI_TIME = "ramCacheRWI_time";
0442: /**
0443: * <p><code>public static final String <strong>RAM_CACHE_HTTP</strong> = "ramCacheHTTP"</code></p>
0444: * <p>Name of the setting how much memory in bytes should be assigned to the HTTP Headers DB for caching purposes</p>
0445: */
0446: public static final String RAM_CACHE_HTTP_TIME = "ramCacheHTTP_time";
0447: /**
0448: * <p><code>public static final String <strong>RAM_CACHE_MESSAGE</strong> = "ramCacheMessage"</code></p>
0449: * <p>Name of the setting how much memory in bytes should be assigned to the Message DB for caching purposes</p>
0450: */
0451: public static final String RAM_CACHE_MESSAGE_TIME = "ramCacheMessage_time";
0452: /**
0453: * <p><code>public static final String <strong>RAM_CACHE_ROBOTS</strong> = "ramCacheRobots"</code></p>
0454: * <p>Name of the setting how much memory in bytes should be assigned to the robots.txts DB for caching purposes</p>
0455: */
0456: public static final String RAM_CACHE_ROBOTS_TIME = "ramCacheRobots_time";
0457: /**
0458: * <p><code>public static final String <strong>RAM_CACHE_PROFILES</strong> = "ramCacheProfiles"</code></p>
0459: * <p>Name of the setting how much memory in bytes should be assigned to the Crawl Profiles DB for caching purposes</p>
0460: */
0461: public static final String RAM_CACHE_PROFILES_TIME = "ramCacheProfiles_time";
0462: /**
0463: * <p><code>public static final String <strong>RAM_CACHE_PRE_NURL</strong> = "ramCachePreNURL"</code></p>
0464: * <p>Name of the setting how much memory in bytes should be assigned to the Pre-Noticed URLs DB for caching purposes</p>
0465: */
0466: public static final String RAM_CACHE_PRE_NURL_TIME = "ramCachePreNURL_time";
0467: /**
0468: * <p><code>public static final String <strong>RAM_CACHE_WIKI</strong> = "ramCacheWiki"</code></p>
0469: * <p>Name of the setting how much memory in bytes should be assigned to the Wiki DB for caching purposes</p>
0470: */
0471: public static final String RAM_CACHE_WIKI_TIME = "ramCacheWiki_time";
0472: /**
0473: * <p><code>public static final String <strong>RAM_CACHE_BLOG</strong> = "ramCacheBlog"</code></p>
0474: * <p>Name of the setting how much memory in bytes should be assigned to the Blog DB for caching purposes</p>
0475: */
0476: public static final String RAM_CACHE_BLOG_TIME = "ramCacheBlog_time";
0477:
0478: //////////////////////////////////////////////////////////////////////////////////////////////
0479: // DHT settings
0480: //////////////////////////////////////////////////////////////////////////////////////////////
0481:
0482: /**
0483: * <p><code>public static final String <strong>INDEX_DIST_DHT_RECEIPT_LIMIT</strong> = "indexDistribution.dhtReceiptLimit"</code></p>
0484: * <p>Name of the setting how many words the DHT-In cache may contain maximal before new DHT receipts
0485: * will be rejected</p>
0486: */
0487: public static final String INDEX_DIST_DHT_RECEIPT_LIMIT = "indexDistribution.dhtReceiptLimit";
0488: /**
0489: * <p><code>public static final String <strong>INDEX_DIST_CHUNK_SIZE_START</strong> = "indexDistribution.startChunkSize"</code></p>
0490: * <p>Name of the setting specifying how many words the very first chunk will contain when the DHT-thread starts</p>
0491: */
0492: public static final String INDEX_DIST_CHUNK_SIZE_START = "indexDistribution.startChunkSize";
0493: /**
0494: * <p><code>public static final String <strong>INDEX_DIST_CHUNK_SIZE_MIN</strong> = "indexDistribution.minChunkSize"</code></p>
0495: * <p>Name of the setting specifying how many words the smallest chunk may contain</p>
0496: */
0497: public static final String INDEX_DIST_CHUNK_SIZE_MIN = "indexDistribution.minChunkSize";
0498: /**
0499: * <p><code>public static final String <strong>INDEX_DIST_CHUNK_SIZE_MAX</strong> = "indexDistribution.maxChunkSize"</code></p>
0500: * <p>Name of the setting specifying how many words the hugest chunk may contain</p>
0501: */
0502: public static final String INDEX_DIST_CHUNK_SIZE_MAX = "indexDistribution.maxChunkSize";
0503: public static final String INDEX_DIST_CHUNK_FAILS_MAX = "indexDistribution.maxChunkFails";
0504: /**
0505: * <p><code>public static final String <strong>INDEX_DIST_TIMEOUT</strong> = "indexDistribution.timeout"</code></p>
0506: * <p>Name of the setting how long the timeout for an Index Distribution shall be in milliseconds</p>
0507: */
0508: public static final String INDEX_DIST_TIMEOUT = "indexDistribution.timeout";
0509: /**
0510: * <p><code>public static final String <strong>INDEX_DIST_GZIP_BODY</strong> = "indexDistribution.gzipBody"</code></p>
0511: * <p>Name of the setting whether DHT chunks shall be transferred gzip-encodedly</p>
0512: */
0513: public static final String INDEX_DIST_GZIP_BODY = "indexDistribution.gzipBody";
0514: /**
0515: * <p><code>public static final String <strong>INDEX_DIST_ALLOW</strong> = "allowDistributeIndex"</code></p>
0516: * <p>Name of the setting whether Index Distribution shall be allowed (and the DHT-thread therefore started) or not</p>
0517: *
0518: * @see plasmaSwitchboard#INDEX_DIST_ALLOW_WHILE_CRAWLING
0519: */
0520: public static final String INDEX_DIST_ALLOW = "allowDistributeIndex";
0521: public static final String INDEX_RECEIVE_ALLOW = "allowReceiveIndex";
0522: /**
0523: * <p><code>public static final String <strong>INDEX_DIST_ALLOW_WHILE_CRAWLING</strong> = "allowDistributeIndexWhileCrawling"</code></p>
0524: * <p>Name of the setting whether Index Distribution shall be allowed while crawling is in progress, i.e.
0525: * the Local Crawler Queue is filled.</p>
0526: * <p>This setting only has effect if {@link #INDEX_DIST_ALLOW} is enabled</p>
0527: *
0528: * @see plasmaSwitchboard#INDEX_DIST_ALLOW
0529: */
0530: public static final String INDEX_DIST_ALLOW_WHILE_CRAWLING = "allowDistributeIndexWhileCrawling";
0531: public static final String INDEX_DIST_ALLOW_WHILE_INDEXING = "allowDistributeIndexWhileIndexing";
0532: public static final String INDEX_TRANSFER_TIMEOUT = "indexTransfer.timeout";
0533: public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody";
0534:
0535: //////////////////////////////////////////////////////////////////////////////////////////////
0536: // Ranking settings
0537: //////////////////////////////////////////////////////////////////////////////////////////////
0538:
0539: public static final String RANKING_DIST_ON = "CRDistOn";
0540: public static final String RANKING_DIST_0_PATH = "CRDist0Path";
0541: public static final String RANKING_DIST_0_METHOD = "CRDist0Method";
0542: public static final String RANKING_DIST_0_PERCENT = "CRDist0Percent";
0543: public static final String RANKING_DIST_0_TARGET = "CRDist0Target";
0544: public static final String RANKING_DIST_1_PATH = "CRDist1Path";
0545: public static final String RANKING_DIST_1_METHOD = "CRDist1Method";
0546: public static final String RANKING_DIST_1_PERCENT = "CRDist1Percent";
0547: public static final String RANKING_DIST_1_TARGET = "CRDist1Target";
0548:
0549: //////////////////////////////////////////////////////////////////////////////////////////////
0550: // Parser settings
0551: //////////////////////////////////////////////////////////////////////////////////////////////
0552:
0553: public static final String PARSER_MIMETYPES_HTML = "parseableMimeTypes.HTML";
0554: public static final String PARSER_MIMETYPES_PROXY = "parseableMimeTypes.PROXY";
0555: public static final String PARSER_MIMETYPES_CRAWLER = "parseableMimeTypes.CRAWLER";
0556: public static final String PARSER_MIMETYPES_ICAP = "parseableMimeTypes.ICAP";
0557: public static final String PARSER_MIMETYPES_URLREDIRECTOR = "parseableMimeTypes.URLREDIRECTOR";
0558: public static final String PARSER_MIMETYPES_IMAGE = "parseableMimeTypes.IMAGE";
0559: public static final String PARSER_MEDIA_EXT = "mediaExt";
0560: public static final String PARSER_MEDIA_EXT_PARSEABLE = "parseableExt";
0561:
0562: //////////////////////////////////////////////////////////////////////////////////////////////
0563: // Proxy settings
0564: //////////////////////////////////////////////////////////////////////////////////////////////
0565:
0566: /**
0567: * <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
0568: * <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>
0569: */
0570: public static final String PROXY_ONLINE_CAUTION_DELAY = "crawlPause.proxy";
0571: public static final String LOCALSEACH_ONLINE_CAUTION_DELAY = "crawlPause.localsearch";
0572: public static final String REMOTESEARCH_ONLINE_CAUTION_DELAY = "crawlPause.remotesearch";
0573: /**
0574: * <p><code>public static final String <strong>PROXY_PREFETCH_DEPTH</strong> = "proxyPrefetchDepth"</code></p>
0575: * <p>Name of the setting how deep URLs fetched by proxy usage shall be followed</p>
0576: */
0577: public static final String PROXY_PREFETCH_DEPTH = "proxyPrefetchDepth";
0578: public static final String PROXY_CRAWL_ORDER = "proxyCrawlOrder";
0579:
0580: public static final String PROXY_INDEXING_REMOTE = "proxyIndexingRemote";
0581: public static final String PROXY_INDEXING_LOCAL_TEXT = "proxyIndexingLocalText";
0582: public static final String PROXY_INDEXING_LOCAL_MEDIA = "proxyIndexingLocalMedia";
0583:
0584: public static final String PROXY_CACHE_SIZE = "proxyCacheSize";
0585: /**
0586: * <p><code>public static final String <strong>PROXY_CACHE_LAYOUT</strong> = "proxyCacheLayout"</code></p>
0587: * <p>Name of the setting which file-/folder-layout the proxy cache shall use. Possible values are {@link #PROXY_CACHE_LAYOUT_TREE}
0588: * and {@link #PROXY_CACHE_LAYOUT_HASH}</p>
0589: *
0590: * @see plasmaSwitchboard#PROXY_CACHE_LAYOUT_TREE
0591: * @see plasmaSwitchboard#PROXY_CACHE_LAYOUT_HASH
0592: */
0593: public static final String PROXY_CACHE_LAYOUT = "proxyCacheLayout";
0594: /**
0595: * <p><code>public static final String <strong>PROXY_CACHE_LAYOUT_TREE</strong> = "tree"</code></p>
0596: * <p>Setting the file-/folder-structure for {@link #PROXY_CACHE_LAYOUT}. Websites are stored in a folder-layout
0597: * according to the layout, the URL purported. The first folder is either <code>http</code> or <code>https</code>
0598: * depending on the protocol used to fetch the website, descending follows the hostname and the sub-folders on the
0599: * website up to the actual file itself.</p>
0600: * <p>When using <code>tree</code>, be aware that
0601: * the possibility of inconsistencies between folders and files with the same name may occur which prevent proper
0602: * storage of the fetched site. Below is an example how files are stored:</p>
0603: * <pre>
0604: * /html/
0605: * /html/www.example.com/
0606: * /html/www.example.com/index/
0607: * /html/www.example.com/index/en/
0608: * /html/www.example.com/index/en/index.html</pre>
0609: */
0610: public static final String PROXY_CACHE_LAYOUT_TREE = "tree";
0611: /**
0612: * <p><code>public static final String <strong>PROXY_CACHE_LAYOUT_HASH</strong> = "hash"</code></p>
0613: * <p>Setting the file-/folder-structure for {@link #PROXY_CACHE_LAYOUT}. Websites are stored using the MD5-sum of
0614: * their respective URLs. This method prevents collisions on some websites caused by using the {@link #PROXY_CACHE_LAYOUT_TREE}
0615: * layout.</p>
0616: * <p>Similarly to {@link #PROXY_CACHE_LAYOUT_TREE}, the top-folders name is given by the protocol used to fetch the site,
0617: * followed by either <code>www</code> or – if the hostname does not start with "www" – <code>other</code>.
0618: * Afterwards the next folder has the rest of the hostname as name, followed by a folder <code>hash</code> which contains
0619: * a folder consisting of the first two letters of the hash. Another folder named after the 3rd and 4th letters of the
0620: * hash follows which finally contains the file named after the full 18-characters long hash.
0621: * Below is an example how files are stored:</p>
0622: * <pre>
0623: * /html/
0624: * /html/www/
0625: * /html/www/example.com/
0626: * /html/www/example.com/hash/
0627: * /html/www/example.com/hash/0d/
0628: * /html/www/example.com/hash/0d/f8/
0629: * /html/www/example.com/hash/0d/f8/0df83a8444f48317d8</pre>
0630: */
0631: public static final String PROXY_CACHE_LAYOUT_HASH = "hash";
0632: public static final String PROXY_CACHE_MIGRATION = "proxyCacheMigration";
0633:
0634: //////////////////////////////////////////////////////////////////////////////////////////////
0635: // Cluster settings
0636: //////////////////////////////////////////////////////////////////////////////////////////////
0637:
0638: public static final String CLUSTER_MODE = "cluster.mode";
0639: public static final String CLUSTER_MODE_PUBLIC_CLUSTER = "publiccluster";
0640: public static final String CLUSTER_MODE_PRIVATE_CLUSTER = "privatecluster";
0641: public static final String CLUSTER_MODE_PUBLIC_PEER = "publicpeer";
0642: public static final String CLUSTER_PEERS_IPPORT = "cluster.peers.ipport";
0643:
0644: //////////////////////////////////////////////////////////////////////////////////////////////
0645: // Miscellaneous settings
0646: //////////////////////////////////////////////////////////////////////////////////////////////
0647:
0648: public static final String CRAWL_PROFILE_PROXY = "proxy";
0649: public static final String CRAWL_PROFILE_REMOTE = "remote";
0650: public static final String CRAWL_PROFILE_SNIPPET_TEXT = "snippetText";
0651: public static final String CRAWL_PROFILE_SNIPPET_MEDIA = "snippetMedia";
0652:
0653: /**
0654: * <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
0655: * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
0656: */
0657: public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
0658:
0659: public static final String OWN_SEED_FILE = "yacyOwnSeedFile";
0660: /**
0661: * <p><code>public static final String <strong>STORAGE_PEER_HASH</strong> = "storagePeerHash"</code></p>
0662: * <p>Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting
0663: * is empty, the Storage Peer function is disabled</p>
0664: */
0665: public static final String STORAGE_PEER_HASH = "storagePeerHash";
0666: public static final String YACY_MODE_DEBUG = "yacyDebugMode";
0667:
0668: public static final String WORDCACHE_INIT_COUNT = "wordCacheInitCount";
0669: /**
0670: * <p><code>public static final String <strong>WORDCACHE_MAX_COUNT</strong> = "wordCacheMaxCount"</code></p>
0671: * <p>Name of the setting how many words the word-cache (or DHT-Out cache) shall contain maximal. Indexing pages if the
0672: * cache has reached this limit will slow down the indexing process by flushing some of it's entries</p>
0673: */
0674: public static final String WORDCACHE_MAX_COUNT = "wordCacheMaxCount";
0675:
0676: public static final String HTTPC_NAME_CACHE_CACHING_PATTERNS_NO = "httpc.nameCacheNoCachingPatterns";
0677:
0678: public static final String ROBOTS_TXT = "httpd.robots.txt";
0679: public static final String ROBOTS_TXT_DEFAULT = httpdRobotsTxtConfig.LOCKED
0680: + "," + httpdRobotsTxtConfig.DIRS;
0681:
0682: public static final String WIKIPARSER_CLASS = "wikiParser.class";
0683: public static final String WIKIPARSER_CLASS_DEFAULT = "de.anomic.data.wikiCode";
0684:
0685: //////////////////////////////////////////////////////////////////////////////////////////////
0686: // Lists
0687: //////////////////////////////////////////////////////////////////////////////////////////////
0688:
0689: /**
0690: * <p><code>public static final String <strong>BLACKLIST_CLASS</strong> = "Blacklist.class"</code></p>
0691: * <p>Name of the setting which Blacklist backend shall be used. Due to different requirements of users, the
0692: * {@link plasmaURLPattern}-interface has been created to support blacklist engines different from YaCy's default</p>
0693: * <p>Attention is required when the backend is changed, because different engines may have different syntaxes</p>
0694: */
0695: public static final String BLACKLIST_CLASS = "BlackLists.class";
0696: /**
0697: * <p><code>public static final String <strong>BLACKLIST_CLASS_DEFAULT</strong> = "de.anomic.plasma.urlPattern.defaultURLPattern"</code></p>
0698: * <p>Package and name of YaCy's {@link defaultURLPattern default} blacklist implementation</p>
0699: *
0700: * @see defaultURLPattern for a detailed overview about the syntax of the default implementation
0701: */
0702: public static final String BLACKLIST_CLASS_DEFAULT = "de.anomic.plasma.urlPattern.defaultURLPattern";
0703:
0704: public static final String LIST_BLUE = "plasmaBlueList";
0705: public static final String LIST_BLUE_DEFAULT = null;
0706: public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords";
0707: public static final String LIST_STOPWORDS_DEFAULT = "yacy.stopwords";
0708:
0709: //////////////////////////////////////////////////////////////////////////////////////////////
0710: // DB Paths
0711: //////////////////////////////////////////////////////////////////////////////////////////////
0712:
0713: /**
0714: * <p><code>public static final String <strong>DBPATH</strong> = "dbPath"</code></p>
0715: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0716: * databases containing queues are stored</p>
0717: */
0718: public static final String DBPATH = "dbPath";
0719: public static final String DBPATH_DEFAULT = "DATA/PLASMADB";
0720: /**
0721: * <p><code>public static final String <strong>HTCACHE_PATH</strong> = "proxyCache"</code></p>
0722: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0723: * downloaded webpages and their respective ressources and HTTP-headers are stored. It is the location containing
0724: * the proxy-cache</p>
0725: *
0726: * @see plasmaSwitchboard#PROXY_CACHE_LAYOUT for details on the file-layout in this path
0727: */
0728: public static final String HTCACHE_PATH = "proxyCache";
0729: public static final String HTCACHE_PATH_DEFAULT = "DATA/HTCACHE";
0730: public static final String RELEASE_PATH = "releases";
0731: public static final String RELEASE_PATH_DEFAULT = "DATA/RELEASE";
0732: /**
0733: * <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>
0734: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0735: * user-ressources (i.e. for the fileshare or the contents displayed on <code>www.peername.yacy</code>) lie.
0736: * The translated templates of the webinterface will also be put in here</p>
0737: */
0738: public static final String HTDOCS_PATH = "htDocsPath";
0739: public static final String HTDOCS_PATH_DEFAULT = "DATA/HTDOCS";
0740: /**
0741: * <p><code>public static final String <strong>HTROOT_PATH</strong> = "htRootPath"</code></p>
0742: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0743: * original servlets, their stylesheets, scripts, etc. lie. It is also home of the XML-interface to YaCy</p>
0744: */
0745: public static final String HTROOT_PATH = "htRootPath";
0746: public static final String HTROOT_PATH_DEFAULT = "htroot";
0747: /**
0748: * <p><code>public static final String <strong>INDEX_PATH</strong> = "indexPath"</code></p>
0749: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the
0750: * whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored</p>
0751: */
0752: public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root
0753: public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is a absolute path to any location
0754: public static final String INDEX_PATH_DEFAULT = "DATA/INDEX";
0755: /**
0756: * <p><code>public static final String <strong>LISTS_PATH</strong> = "listsPath"</code></p>
0757: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0758: * user-lists like blacklists, etc. are stored</p>
0759: */
0760: public static final String LISTS_PATH = "listsPath";
0761: public static final String LISTS_PATH_DEFAULT = "DATA/LISTS";
0762: /**
0763: * <p><code>public static final String <strong>RANKING_PATH</strong> = "rankingPath"</code></p>
0764: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0765: * ranking files are stored, self-generated as well as received ranking files</p>
0766: *
0767: * @see plasmaSwitchboard#RANKING_DIST_0_PATH
0768: * @see plasmaSwitchboard#RANKING_DIST_1_PATH
0769: */
0770: public static final String RANKING_PATH = "rankingPath";
0771: public static final String RANKING_PATH_DEFAULT = "DATA/RANKING";
0772: /**
0773: * <p><code>public static final String <strong>WORK_PATH</strong> = "wordPath"</code></p>
0774: * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
0775: * DBs containing "work" of the user are saved. Such include bookmarks, messages, wiki, blog</p>
0776: *
0777: * @see plasmaSwitchboard#DBFILE_BLOG
0778: * @see plasmaSwitchboard#DBFILE_BOOKMARKS
0779: * @see plasmaSwitchboard#DBFILE_BOOKMARKS_DATES
0780: * @see plasmaSwitchboard#DBFILE_BOOKMARKS_TAGS
0781: * @see plasmaSwitchboard#DBFILE_MESSAGE
0782: * @see plasmaSwitchboard#DBFILE_WIKI
0783: * @see plasmaSwitchboard#DBFILE_WIKI_BKP
0784: */
0785: public static final String WORK_PATH = "workPath";
0786: public static final String WORK_PATH_DEFAULT = "DATA/WORK";
0787:
0788: //////////////////////////////////////////////////////////////////////////////////////////////
0789: // DB files
0790: //////////////////////////////////////////////////////////////////////////////////////////////
0791:
0792: /**
0793: * <p><code>public static final String <strong>DBFILE_MESSAGE</strong> = "message.db"</code></p>
0794: * <p>Name of the file containing the database holding the user's peer-messages</p>
0795: *
0796: * @see plasmaSwitchboard#WORK_PATH for the folder, this file lies in
0797: */
0798: public static final String DBFILE_MESSAGE = "message.db";
0799: /**
0800: * <p><code>public static final String <strong>DBFILE_WIKI</strong> = "wiki.db"</code></p>
0801: * <p>Name of the file containing the database holding the whole wiki of this peer</p>
0802: *
0803: * @see plasmaSwitchboard#WORK_PATH for the folder, this file lies in
0804: * @see plasmaSwitchboard#DBFILE_WIKI_BKP for the file previous versions of wiki-pages lie in
0805: */
0806: public static final String DBFILE_WIKI = "wiki.db";
0807: /**
0808: * <p><code>public static final String <strong>DBFILE_WIKI_BKP</strong> = "wiki-bkp.db"</code></p>
0809: * <p>Name of the file containing the database holding all versions but the latest of the wiki-pages of this peer</p>
0810: *
0811: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0812: * @see plasmaSwitchboard#DBFILE_WIKI for the file the latest version of wiki-pages lie in
0813: */
0814: public static final String DBFILE_WIKI_BKP = "wiki-bkp.db";
0815: /**
0816: * <p><code>public static final String <strong>DBFILE_BLOG</strong> = "blog.db"</code></p>
0817: * <p>Name of the file containing the database holding all blog-entries available on this peer</p>
0818: *
0819: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0820: */
0821: public static final String DBFILE_BLOG = "blog.db";
0822: /**
0823: * <p><code>public static final String <strong>DBFILE_BLOGCOMMENTS</strong> = "blogComment.db"</code></p>
0824: * <p>Name of the file containing the database holding all blogComment-entries available on this peer</p>
0825: *
0826: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0827: */
0828: public static final String DBFILE_BLOGCOMMENTS = "blogComment.db";
0829: /**
0830: * <p><code>public static final String <strong>DBFILE_BOOKMARKS</strong> = "bookmarks.db"</code></p>
0831: * <p>Name of the file containing the database holding all bookmarks available on this peer</p>
0832: *
0833: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0834: * @see bookmarksDB for more detailed overview about the bookmarks structure
0835: */
0836: public static final String DBFILE_BOOKMARKS = "bookmarks.db";
0837: /**
0838: * <p><code>public static final String <strong>DBFILE_BOOKMARKS_TAGS</strong> = "bookmarkTags.db"</code></p>
0839: * <p>Name of the file containing the database holding all tag->bookmark relations</p>
0840: *
0841: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0842: * @see bookmarksDB for more detailed overview about the bookmarks structure
0843: */
0844: public static final String DBFILE_BOOKMARKS_TAGS = "bookmarkTags.db";
0845: /**
0846: * <p><code>public static final String <strong>DBFILE_BOOKMARKS_DATES</strong> = "bookmarkDates.db"</code></p>
0847: * <p>Name of the file containing the database holding all date->bookmark relations</p>
0848: *
0849: * @see plasmaSwitchboard#WORK_PATH for the folder this file lies in
0850: * @see bookmarksDB for more detailed overview about the bookmarks structure
0851: */
0852: public static final String DBFILE_BOOKMARKS_DATES = "bookmarkDates.db";
0853: /**
0854: * <p><code>public static final String <strong>DBFILE_OWN_SEED</strong> = "mySeed.txt"</code></p>
0855: * <p>Name of the file containing the database holding this peer's seed</p>
0856: */
0857: public static final String DBFILE_OWN_SEED = "mySeed.txt";
0858: /**
0859: * <p><code>public static final String <strong>DBFILE_CRAWL_PROFILES</strong> = "crawlProfiles0.db"</code>
0860: * <p>Name of the file containing the database holding all recent crawl profiles</p>
0861: *
0862: * @see plasmaSwitchboard#DBPATH for the folder this file lies in
0863: */
0864: public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.db";
0865: public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.db";
0866: /**
0867: * <p><code>public static final String <strong>DBFILE_CRAWL_ROBOTS</strong> = "crawlRobotsTxt.db"</code></p>
0868: * <p>Name of the file containing the database holding all <code>robots.txt</code>-entries of the lately crawled domains</p>
0869: *
0870: * @see plasmaSwitchboard#DBPATH for the folder this file lies in
0871: */
0872: public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt1.db";
0873: /**
0874: * <p><code>public static final String <strong>DBFILE_USER</strong> = "DATA/SETTINGS/user.db"</code></p>
0875: * <p>Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created
0876: * users have as well as all other needed data about them</p>
0877: */
0878: public static final String DBFILE_USER = "DATA/SETTINGS/user.db";
0879:
0880: public Hashtable<String, Object[]> crawlJobsStatus = new Hashtable<String, Object[]>();
0881:
0882: private static plasmaSwitchboard sb;
0883:
0884: public plasmaSwitchboard(File rootPath, String initPath,
0885: String configPath, boolean applyPro) {
0886: super (rootPath, initPath, configPath, applyPro);
0887: serverProfiling.startSystemProfiling();
0888: sb = this ;
0889:
0890: // set loglevel and log
0891: setLog(new serverLog("PLASMA"));
0892: if (applyPro)
0893: this .log.logInfo("This is the pro-version of YaCy");
0894:
0895: // remote proxy configuration
0896: this .remoteProxyConfig = httpRemoteProxyConfig.init(this );
0897: this .log.logConfig("Remote proxy configuration:\n"
0898: + this .remoteProxyConfig.toString());
0899:
0900: // load network configuration into settings
0901: String networkUnitDefinition = getConfig(
0902: "network.unit.definition", "yacy.network.unit");
0903: String networkGroupDefinition = getConfig(
0904: "network.group.definition", "yacy.network.group");
0905:
0906: // include additional network definition properties into our settings
0907: // note that these properties cannot be set in the application because they are
0908: // _always_ overwritten each time with the default values. This is done so on purpose.
0909: // the network definition should be made either consistent for all peers,
0910: // or independently using a bootstrap URL
0911: Map<String, String> initProps;
0912: if (networkUnitDefinition.startsWith("http://")) {
0913: try {
0914: this .setConfig(httpc
0915: .loadHashMap(new yacyURL(networkUnitDefinition,
0916: null), remoteProxyConfig));
0917: } catch (MalformedURLException e) {
0918: }
0919: } else {
0920: File networkUnitDefinitionFile = new File(rootPath,
0921: networkUnitDefinition);
0922: if (networkUnitDefinitionFile.exists()) {
0923: initProps = serverFileUtils
0924: .loadHashMap(networkUnitDefinitionFile);
0925: this .setConfig(initProps);
0926: }
0927: }
0928: if (networkGroupDefinition.startsWith("http://")) {
0929: try {
0930: this .setConfig(httpc.loadHashMap(new yacyURL(
0931: networkGroupDefinition, null),
0932: remoteProxyConfig));
0933: } catch (MalformedURLException e) {
0934: }
0935: } else {
0936: File networkGroupDefinitionFile = new File(rootPath,
0937: networkGroupDefinition);
0938: if (networkGroupDefinitionFile.exists()) {
0939: initProps = serverFileUtils
0940: .loadHashMap(networkGroupDefinitionFile);
0941: this .setConfig(initProps);
0942: }
0943: }
0944:
0945: // set release locations
0946: int i = 0;
0947: String location;
0948: while (true) {
0949: location = getConfig("network.unit.update.location" + i, "");
0950: if (location.length() == 0)
0951: break;
0952: try {
0953: yacyVersion.latestReleaseLocations.add(new yacyURL(
0954: location, null));
0955: } catch (MalformedURLException e) {
0956: break;
0957: }
0958: i++;
0959: }
0960:
0961: // initiate url license object
0962: licensedURLs = new URLLicense(8);
0963:
0964: // set URL domain acceptance
0965: this .acceptGlobalURLs = "global.any".indexOf(getConfig(
0966: "network.unit.domain", "global")) >= 0;
0967: this .acceptLocalURLs = "local.any".indexOf(getConfig(
0968: "network.unit.domain", "global")) >= 0;
0969:
0970: // start yacy core
0971: log.logConfig("Starting YaCy Protocol Core");
0972: this .yc = new yacyCore(this );
0973: serverInstantThread.oneTimeJob(yacyCore.peerActions,
0974: "loadSeedLists", yacyCore.log, 0);
0975: long startedSeedListAquisition = System.currentTimeMillis();
0976:
0977: // load values from configs
0978: this .plasmaPath = getConfigPath(DBPATH, DBPATH_DEFAULT);
0979: this .log.logConfig("Plasma DB Path: "
0980: + this .plasmaPath.toString());
0981: this .indexPrimaryPath = getConfigPath(INDEX_PRIMARY_PATH,
0982: INDEX_PATH_DEFAULT);
0983: this .log.logConfig("Index Primary Path: "
0984: + this .indexPrimaryPath.toString());
0985: this .indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "")
0986: .length() == 0) ? indexPrimaryPath : new File(
0987: getConfig(INDEX_SECONDARY_PATH, ""));
0988: this .log.logConfig("Index Secondary Path: "
0989: + this .indexSecondaryPath.toString());
0990: this .listsPath = getConfigPath(LISTS_PATH, LISTS_PATH_DEFAULT);
0991: this .log.logConfig("Lists Path: "
0992: + this .listsPath.toString());
0993: this .htDocsPath = getConfigPath(HTDOCS_PATH,
0994: HTDOCS_PATH_DEFAULT);
0995: this .log.logConfig("HTDOCS Path: "
0996: + this .htDocsPath.toString());
0997: this .rankingPath = getConfigPath(RANKING_PATH,
0998: RANKING_PATH_DEFAULT);
0999: this .log.logConfig("Ranking Path: "
1000: + this .rankingPath.toString());
1001: this .rankingPermissions = new HashMap<String, String>(); // mapping of permission - to filename.
1002: this .workPath = getConfigPath(WORK_PATH, WORK_PATH_DEFAULT);
1003: this .log.logConfig("Work Path: " + this .workPath.toString());
1004:
1005: // set up local robots.txt
1006: this .robotstxtConfig = httpdRobotsTxtConfig.init(this );
1007:
1008: // setting timestamp of last proxy access
1009: this .proxyLastAccess = System.currentTimeMillis() - 60000;
1010: this .localSearchLastAccess = System.currentTimeMillis() - 60000;
1011: this .remoteSearchLastAccess = System.currentTimeMillis() - 60000;
1012: this .webStructure = new plasmaWebStructure(log, rankingPath,
1013: "LOCAL/010_cr/", getConfig("CRDist0Path",
1014: plasmaRankingDistribution.CR_OWN), new File(
1015: plasmaPath, "webStructure.map"));
1016:
1017: // configuring list path
1018: if (!(listsPath.exists()))
1019: listsPath.mkdirs();
1020:
1021: // load coloured lists
1022: if (blueList == null) {
1023: // read only once upon first instantiation of this class
1024: String f = getConfig(LIST_BLUE, LIST_BLUE_DEFAULT);
1025: File plasmaBlueListFile = new File(f);
1026: if (f != null)
1027: blueList = kelondroMSetTools.loadList(
1028: plasmaBlueListFile,
1029: kelondroNaturalOrder.naturalComparator);
1030: else
1031: blueList = new TreeSet<String>();
1032: this .log.logConfig("loaded blue-list from file "
1033: + plasmaBlueListFile.getName() + ", "
1034: + blueList.size() + " entries, "
1035: + ppRamString(plasmaBlueListFile.length() / 1024));
1036: }
1037:
1038: // load the black-list / inspired by [AS]
1039: File blacklistsPath = getConfigPath(LISTS_PATH,
1040: LISTS_PATH_DEFAULT);
1041: String blacklistClassName = getConfig(BLACKLIST_CLASS,
1042: BLACKLIST_CLASS_DEFAULT);
1043:
1044: this .log.logConfig("Starting blacklist engine ...");
1045: try {
1046: Class<?> blacklistClass = Class.forName(blacklistClassName);
1047: Constructor<?> blacklistClassConstr = blacklistClass
1048: .getConstructor(new Class[] { File.class });
1049: urlBlacklist = (plasmaURLPattern) blacklistClassConstr
1050: .newInstance(new Object[] { blacklistsPath });
1051: this .log.logFine("Used blacklist engine class: "
1052: + blacklistClassName);
1053: this .log.logConfig("Using blacklist engine: "
1054: + urlBlacklist.getEngineInfo());
1055: } catch (Exception e) {
1056: this .log
1057: .logSevere("Unable to load the blacklist engine", e);
1058: System.exit(-1);
1059: } catch (Error e) {
1060: this .log
1061: .logSevere("Unable to load the blacklist engine", e);
1062: System.exit(-1);
1063: }
1064:
1065: this .log.logConfig("Loading backlist data ...");
1066: listManager.switchboard = this ;
1067: listManager.listsPath = blacklistsPath;
1068: listManager.reloadBlacklists();
1069:
1070: // load badwords (to filter the topwords)
1071: if (badwords == null) {
1072: File badwordsFile = new File(rootPath,
1073: LIST_BADWORDS_DEFAULT);
1074: badwords = kelondroMSetTools.loadList(badwordsFile,
1075: kelondroNaturalOrder.naturalComparator);
1076: this .log.logConfig("loaded badwords from file "
1077: + badwordsFile.getName() + ", " + badwords.size()
1078: + " entries, "
1079: + ppRamString(badwordsFile.length() / 1024));
1080: }
1081:
1082: // load stopwords
1083: if (stopwords == null) {
1084: File stopwordsFile = new File(rootPath,
1085: LIST_STOPWORDS_DEFAULT);
1086: stopwords = kelondroMSetTools.loadList(stopwordsFile,
1087: kelondroNaturalOrder.naturalComparator);
1088: this .log.logConfig("loaded stopwords from file "
1089: + stopwordsFile.getName() + ", " + stopwords.size()
1090: + " entries, "
1091: + ppRamString(stopwordsFile.length() / 1024));
1092: }
1093:
1094: // load ranking tables
1095: File YBRPath = new File(rootPath, "ranking/YBR");
1096: if (YBRPath.exists()) {
1097: plasmaSearchRankingProcess.loadYBR(YBRPath, 15);
1098: }
1099:
1100: // read memory amount
1101: long ramLURL_time = getConfigLong(RAM_CACHE_LURL_TIME, 1000);
1102: long ramNURL_time = getConfigLong(RAM_CACHE_NURL_TIME, 1000);
1103: long ramEURL_time = getConfigLong(RAM_CACHE_EURL_TIME, 1000);
1104: long ramRWI_time = getConfigLong(RAM_CACHE_RWI_TIME, 1000);
1105: long ramHTTP_time = getConfigLong(RAM_CACHE_HTTP_TIME, 1000);
1106: long ramMessage_time = getConfigLong(RAM_CACHE_MESSAGE_TIME,
1107: 1000);
1108: long ramRobots_time = getConfigLong(RAM_CACHE_ROBOTS_TIME, 1000);
1109: long ramProfiles_time = getConfigLong(RAM_CACHE_PROFILES_TIME,
1110: 1000);
1111: long ramPreNURL_time = getConfigLong(RAM_CACHE_PRE_NURL_TIME,
1112: 1000);
1113: long ramWiki_time = getConfigLong(RAM_CACHE_WIKI_TIME, 1000);
1114: long ramBlog_time = getConfigLong(RAM_CACHE_BLOG_TIME, 1000);
1115: this .log.logConfig("LURL preloadTime = " + ramLURL_time);
1116: this .log.logConfig("NURL preloadTime = " + ramNURL_time);
1117: this .log.logConfig("EURL preloadTime = " + ramEURL_time);
1118: this .log.logConfig("RWI preloadTime = " + ramRWI_time);
1119: this .log.logConfig("HTTP preloadTime = " + ramHTTP_time);
1120: this .log.logConfig("Message preloadTime = " + ramMessage_time);
1121: this .log.logConfig("Wiki preloadTime = " + ramWiki_time);
1122: this .log.logConfig("Blog preloadTime = " + ramBlog_time);
1123: this .log.logConfig("Robots preloadTime = " + ramRobots_time);
1124: this .log
1125: .logConfig("Profiles preloadTime = " + ramProfiles_time);
1126: this .log.logConfig("PreNURL preloadTime = " + ramPreNURL_time);
1127:
1128: // make crawl profiles database and default profiles
1129: this .log.logConfig("Initializing Crawl Profiles");
1130: File profilesActiveFile = new File(this .plasmaPath,
1131: DBFILE_ACTIVE_CRAWL_PROFILES);
1132: this .profilesActiveCrawls = new plasmaCrawlProfile(
1133: profilesActiveFile, ramProfiles_time);
1134: initActiveCrawlProfiles();
1135: log.logConfig("Loaded active crawl profiles from file "
1136: + profilesActiveFile.getName() + ", "
1137: + this .profilesActiveCrawls.size() + " entries" + ", "
1138: + ppRamString(profilesActiveFile.length() / 1024));
1139: File profilesPassiveFile = new File(this .plasmaPath,
1140: DBFILE_PASSIVE_CRAWL_PROFILES);
1141: this .profilesPassiveCrawls = new plasmaCrawlProfile(
1142: profilesPassiveFile, ramProfiles_time);
1143: log.logConfig("Loaded passive crawl profiles from file "
1144: + profilesPassiveFile.getName() + ", "
1145: + this .profilesPassiveCrawls.size() + " entries" + ", "
1146: + ppRamString(profilesPassiveFile.length() / 1024));
1147:
1148: // loading the robots.txt db
1149: this .log.logConfig("Initializing robots.txt DB");
1150: File robotsDBFile = new File(this .plasmaPath,
1151: DBFILE_CRAWL_ROBOTS);
1152: robots = new plasmaCrawlRobotsTxt(robotsDBFile, ramRobots_time);
1153: this .log.logConfig("Loaded robots.txt DB from file "
1154: + robotsDBFile.getName() + ", " + robots.size()
1155: + " entries" + ", "
1156: + ppRamString(robotsDBFile.length() / 1024));
1157:
1158: // start a cache manager
1159: log.logConfig("Starting HT Cache Manager");
1160:
1161: // create the cache directory
1162: htCachePath = getConfigPath(HTCACHE_PATH, HTCACHE_PATH_DEFAULT);
1163: this .log.logInfo("HTCACHE Path = "
1164: + htCachePath.getAbsolutePath());
1165: long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig(
1166: PROXY_CACHE_SIZE, "2")); // this is megabyte
1167: String cacheLayout = getConfig(PROXY_CACHE_LAYOUT,
1168: PROXY_CACHE_LAYOUT_TREE);
1169: boolean cacheMigration = getConfigBool(PROXY_CACHE_MIGRATION,
1170: true);
1171: plasmaHTCache.init(htCachePath, maxCacheSize, ramHTTP_time,
1172: cacheLayout, cacheMigration);
1173:
1174: // create the release download directory
1175: releasePath = getConfigPath(RELEASE_PATH, RELEASE_PATH_DEFAULT);
1176: releasePath.mkdirs();
1177: this .log.logInfo("RELEASE Path = "
1178: + releasePath.getAbsolutePath());
1179:
1180: // starting message board
1181: initMessages(ramMessage_time);
1182:
1183: // starting wiki
1184: initWiki(ramWiki_time);
1185:
1186: //starting blog
1187: initBlog(ramBlog_time);
1188:
1189: // Init User DB
1190: this .log.logConfig("Loading User DB");
1191: File userDbFile = new File(getRootPath(), DBFILE_USER);
1192: this .userDB = new userDB(userDbFile, 2000);
1193: this .log.logConfig("Loaded User DB from file "
1194: + userDbFile.getName() + ", " + this .userDB.size()
1195: + " entries" + ", "
1196: + ppRamString(userDbFile.length() / 1024));
1197:
1198: //Init bookmarks DB
1199: initBookmarks();
1200:
1201: // start indexing management
1202: log.logConfig("Starting Indexing Management");
1203: wordIndex = new plasmaWordIndex(indexPrimaryPath,
1204: indexSecondaryPath, ramRWI_time, log);
1205:
1206: // set a high maximum cache size to current size; this is adopted later automatically
1207: int wordCacheMaxCount = Math.max((int) getConfigLong(
1208: WORDCACHE_INIT_COUNT, 30000), (int) getConfigLong(
1209: WORDCACHE_MAX_COUNT, 20000));
1210: setConfig(WORDCACHE_MAX_COUNT, Integer
1211: .toString(wordCacheMaxCount));
1212: wordIndex.setMaxWordCount(wordCacheMaxCount);
1213: wordIndex.setWordFlushSize((int) getConfigLong("wordFlushSize",
1214: 10000));
1215:
1216: // set a maximum amount of memory for the caches
1217: // long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
1218: // setConfig(INDEXER_MEMPREREQ, memprereq);
1219: // setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
1220: kelondroCachedRecords.setCacheGrowStati(40 * 1024 * 1024,
1221: 20 * 1024 * 1024);
1222: kelondroCache.setCacheGrowStati(40 * 1024 * 1024,
1223: 20 * 1024 * 1024);
1224:
1225: // make parser
1226: log.logConfig("Starting Parser");
1227: this .parser = new plasmaParser();
1228:
1229: /* ======================================================================
1230: * initialize switchboard queue
1231: * ====================================================================== */
1232: // create queue
1233: this .sbQueue = new plasmaSwitchboardQueue(
1234: this .wordIndex.loadedURL, new File(this .plasmaPath,
1235: "switchboardQueue2.stack"),
1236: this .profilesActiveCrawls);
1237:
1238: // create in process list
1239: this .indexingTasksInProcess = new HashMap<String, plasmaSwitchboardQueue.Entry>();
1240:
1241: // going through the sbQueue Entries and registering all content files as in use
1242: int count = 0;
1243: plasmaSwitchboardQueue.Entry queueEntry;
1244: Iterator<plasmaSwitchboardQueue.Entry> i1 = sbQueue
1245: .entryIterator(true);
1246: while (i1.hasNext()) {
1247: queueEntry = i1.next();
1248: if ((queueEntry != null) && (queueEntry.url() != null)
1249: && (queueEntry.cacheFile().exists())) {
1250: plasmaHTCache.filesInUse.add(queueEntry.cacheFile());
1251: count++;
1252: }
1253: }
1254: this .log
1255: .logConfig(count
1256: + " files in htcache reported to the cachemanager as in use.");
1257:
1258: // define an extension-blacklist
1259: log
1260: .logConfig("Parser: Initializing Extension Mappings for Media/Parser");
1261: plasmaParser.initMediaExt(plasmaParser
1262: .extString2extList(getConfig(PARSER_MEDIA_EXT, "")));
1263: plasmaParser.initSupportedHTMLFileExt(plasmaParser
1264: .extString2extList(getConfig(
1265: PARSER_MEDIA_EXT_PARSEABLE, "")));
1266:
1267: // define a realtime parsable mimetype list
1268: log.logConfig("Parser: Initializing Mime Types");
1269: plasmaParser.initHTMLParsableMimeTypes(getConfig(
1270: PARSER_MIMETYPES_HTML,
1271: "application/xhtml+xml,text/html,text/plain"));
1272: plasmaParser.initParseableMimeTypes(
1273: plasmaParser.PARSER_MODE_PROXY, getConfig(
1274: PARSER_MIMETYPES_PROXY, null));
1275: plasmaParser.initParseableMimeTypes(
1276: plasmaParser.PARSER_MODE_CRAWLER, getConfig(
1277: PARSER_MIMETYPES_CRAWLER, null));
1278: plasmaParser.initParseableMimeTypes(
1279: plasmaParser.PARSER_MODE_ICAP, getConfig(
1280: PARSER_MIMETYPES_ICAP, null));
1281: plasmaParser.initParseableMimeTypes(
1282: plasmaParser.PARSER_MODE_URLREDIRECTOR, getConfig(
1283: PARSER_MIMETYPES_URLREDIRECTOR, null));
1284: plasmaParser.initParseableMimeTypes(
1285: plasmaParser.PARSER_MODE_IMAGE, getConfig(
1286: PARSER_MIMETYPES_IMAGE, null));
1287:
1288: // start a loader
1289: log.logConfig("Starting Crawl Loader");
1290: this .crawlQueues = new plasmaCrawlQueues(this , plasmaPath);
1291:
1292: /*
1293: * Creating sync objects and loading status for the crawl jobs
1294: * a) local crawl
1295: * b) remote triggered crawl
1296: * c) global crawl trigger
1297: */
1298: this .crawlJobsStatus.put(CRAWLJOB_LOCAL_CRAWL, new Object[] {
1299: new Object(),
1300: Boolean.valueOf(getConfig(CRAWLJOB_LOCAL_CRAWL
1301: + "_isPaused", "false")) });
1302: this .crawlJobsStatus.put(CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
1303: new Object[] {
1304: new Object(),
1305: Boolean.valueOf(getConfig(
1306: CRAWLJOB_REMOTE_TRIGGERED_CRAWL
1307: + "_isPaused", "false")) });
1308: this .crawlJobsStatus.put(CRAWLJOB_REMOTE_CRAWL_LOADER,
1309: new Object[] {
1310: new Object(),
1311: Boolean.valueOf(getConfig(
1312: CRAWLJOB_REMOTE_CRAWL_LOADER
1313: + "_isPaused", "false")) });
1314:
1315: // init cookie-Monitor
1316: this .log.logConfig("Starting Cookie Monitor");
1317: this .outgoingCookies = new HashMap<String, Object[]>();
1318: this .incomingCookies = new HashMap<String, Object[]>();
1319:
1320: // init search history trackers
1321: this .localSearchTracker = new HashMap<String, TreeSet<Long>>(); // String:TreeSet - IP:set of Long(accessTime)
1322: this .remoteSearchTracker = new HashMap<String, TreeSet<Long>>();
1323: this .localSearches = new ArrayList<HashMap<String, Object>>(); // contains search result properties as HashMaps
1324: this .remoteSearches = new ArrayList<HashMap<String, Object>>();
1325:
1326: // init messages: clean up message symbol
1327: File notifierSource = new File(getRootPath(), getConfig(
1328: HTROOT_PATH, HTROOT_PATH_DEFAULT)
1329: + "/env/grafics/empty.gif");
1330: File notifierDest = new File(getConfigPath(HTDOCS_PATH,
1331: HTDOCS_PATH_DEFAULT), "notifier.gif");
1332: try {
1333: serverFileUtils.copy(notifierSource, notifierDest);
1334: } catch (IOException e) {
1335: }
1336:
1337: // clean up profiles
1338: this .log.logConfig("Cleaning Profiles");
1339: try {
1340: cleanProfiles();
1341: } catch (InterruptedException e) { /* Ignore this here */
1342: }
1343:
1344: // init ranking transmission
1345: /*
1346: CRDistOn = true/false
1347: CRDist0Path = GLOBAL/010_owncr
1348: CRDist0Method = 1
1349: CRDist0Percent = 0
1350: CRDist0Target =
1351: CRDist1Path = GLOBAL/014_othercr/1
1352: CRDist1Method = 9
1353: CRDist1Percent = 30
1354: CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
1355: **/
1356: rankingOn = getConfig(RANKING_DIST_ON, "true").equals("true")
1357: && getConfig("network.unit.name", "").equals(
1358: "freeworld");
1359: rankingOwnDistribution = new plasmaRankingDistribution(log,
1360: new File(rankingPath, getConfig(RANKING_DIST_0_PATH,
1361: plasmaRankingDistribution.CR_OWN)),
1362: (int) getConfigLong(RANKING_DIST_0_METHOD,
1363: plasmaRankingDistribution.METHOD_ANYSENIOR),
1364: (int) getConfigLong(RANKING_DIST_0_METHOD, 0),
1365: getConfig(RANKING_DIST_0_TARGET, ""));
1366: rankingOtherDistribution = new plasmaRankingDistribution(
1367: log,
1368: new File(rankingPath, getConfig(RANKING_DIST_1_PATH,
1369: plasmaRankingDistribution.CR_OTHER)),
1370: (int) getConfigLong(RANKING_DIST_1_METHOD,
1371: plasmaRankingDistribution.METHOD_MIXEDSENIOR),
1372: (int) getConfigLong(RANKING_DIST_1_METHOD, 30),
1373: getConfig(RANKING_DIST_1_TARGET,
1374: "kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080"));
1375:
1376: // init facility DB
1377: /*
1378: log.logSystem("Starting Facility Database");
1379: File facilityDBpath = new File(getRootPath(), "DATA/SETTINGS/");
1380: facilityDB = new kelondroTables(facilityDBpath);
1381: facilityDB.declareMaps("backlinks", 250, 500, new String[] {"date"}, null);
1382: log.logSystem("..opened backlinks");
1383: facilityDB.declareMaps("zeitgeist", 40, 500);
1384: log.logSystem("..opened zeitgeist");
1385: facilityDB.declareTree("statistik", new int[]{11, 8, 8, 8, 8, 8, 8}, 0x400);
1386: log.logSystem("..opened statistik");
1387: facilityDB.update("statistik", (new serverDate()).toShortString(false).substring(0, 11), new long[]{1,2,3,4,5,6});
1388: long[] testresult = facilityDB.selectLong("statistik", "yyyyMMddHHm");
1389: testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
1390: */
1391:
1392: /*
1393: * Initializing httpc
1394: */
1395: // initializing yacyDebugMode
1396: httpc.yacyDebugMode = getConfig(YACY_MODE_DEBUG, "false")
1397: .equals("true");
1398:
1399: // init nameCacheNoCachingList
1400: String noCachingList = getConfig(
1401: HTTPC_NAME_CACHE_CACHING_PATTERNS_NO, "");
1402: String[] noCachingEntries = noCachingList.split(",");
1403: for (i = 0; i < noCachingEntries.length; i++) {
1404: String entry = noCachingEntries[i].trim();
1405: serverDomains.nameCacheNoCachingPatterns.add(entry);
1406: }
1407:
1408: // generate snippets cache
1409: log.logConfig("Initializing Snippet Cache");
1410: plasmaSnippetCache.init(parser, log);
1411:
1412: String wikiParserClassName = getConfig(WIKIPARSER_CLASS,
1413: WIKIPARSER_CLASS_DEFAULT);
1414: this .log.logConfig("Loading wiki parser " + wikiParserClassName
1415: + " ...");
1416: try {
1417: Class<?> wikiParserClass = Class
1418: .forName(wikiParserClassName);
1419: Constructor<?> wikiParserClassConstr = wikiParserClass
1420: .getConstructor(new Class[] { plasmaSwitchboard.class });
1421: wikiParser = (wikiParser) wikiParserClassConstr
1422: .newInstance(new Object[] { this });
1423: } catch (Exception e) {
1424: this .log.logSevere(
1425: "Unable to load wiki parser, the wiki won't work",
1426: e);
1427: }
1428:
1429: // initializing the stackCrawlThread
1430: this .crawlStacker = new plasmaCrawlStacker(
1431: this ,
1432: this .plasmaPath,
1433: ramPreNURL_time,
1434: (int) getConfigLong("tableTypeForPreNURL", 0),
1435: (((int) getConfigLong("tableTypeForPreNURL", 0) == 0) && (getConfigLong(
1436: CRAWLSTACK_BUSYSLEEP, 0) <= 100)));
1437: //this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL);
1438: //this.sbStackCrawlThread.start();
1439:
1440: // initializing dht chunk generation
1441: this .dhtTransferChunk = null;
1442: this .dhtTransferIndexCount = (int) getConfigLong(
1443: INDEX_DIST_CHUNK_SIZE_START, 50);
1444:
1445: // init robinson cluster
1446: // before we do that, we wait some time until the seed list is loaded.
1447: while (((System.currentTimeMillis() - startedSeedListAquisition) < 8000)
1448: && (yacyCore.seedDB.sizeConnected() == 0))
1449: try {
1450: Thread.sleep(1000);
1451: } catch (InterruptedException e) {
1452: }
1453: try {
1454: Thread.sleep(1000);
1455: } catch (InterruptedException e) {
1456: }
1457: this .clusterhashes = yacyCore.seedDB.clusterHashes(getConfig(
1458: "cluster.peers.yacydomain", ""));
1459:
1460: // deploy threads
1461: log.logConfig("Starting Threads");
1462: serverMemory.gc(1000, "plasmaSwitchboard, help for profiler"); // help for profiler - thq
1463:
1464: moreMemory = new Timer(); // init GC Thread - thq
1465: moreMemory.schedule(new MoreMemory(), 300000, 600000);
1466:
1467: int indexing_cluster = Integer.parseInt(getConfig(
1468: INDEXER_CLUSTER, "1"));
1469: if (indexing_cluster < 1)
1470: indexing_cluster = 1;
1471: deployThread(CLEANUP, "Cleanup",
1472: "simple cleaning process for monitoring information",
1473: null, new serverInstantThread(this ,
1474: CLEANUP_METHOD_START, CLEANUP_METHOD_JOBCOUNT,
1475: CLEANUP_METHOD_FREEMEM), 10000); // all 5 Minutes
1476: deployThread(
1477: CRAWLSTACK,
1478: "Crawl URL Stacker",
1479: "process that checks url for double-occurrences and for allowance/disallowance by robots.txt",
1480: null, new serverInstantThread(crawlStacker,
1481: CRAWLSTACK_METHOD_START,
1482: CRAWLSTACK_METHOD_JOBCOUNT,
1483: CRAWLSTACK_METHOD_FREEMEM), 8000);
1484:
1485: deployThread(
1486: INDEXER,
1487: "Parsing/Indexing",
1488: "thread that performes document parsing and indexing",
1489: "/IndexCreateIndexingQueue_p.html",
1490: new serverInstantThread(this , INDEXER_METHOD_START,
1491: INDEXER_METHOD_JOBCOUNT, INDEXER_METHOD_FREEMEM),
1492: 10000);
1493: for (i = 1; i < indexing_cluster; i++) {
1494: setConfig((i + 80) + "_indexing_idlesleep", getConfig(
1495: INDEXER_IDLESLEEP, ""));
1496: setConfig((i + 80) + "_indexing_busysleep", getConfig(
1497: INDEXER_BUSYSLEEP, ""));
1498: deployThread(
1499: (i + 80) + "_indexing",
1500: "Parsing/Indexing (cluster job)",
1501: "thread that performes document parsing and indexing",
1502: null, new serverInstantThread(this ,
1503: INDEXER_METHOD_START,
1504: INDEXER_METHOD_JOBCOUNT,
1505: INDEXER_METHOD_FREEMEM),
1506: 10000 + (i * 1000), Long.parseLong(getConfig(
1507: INDEXER_IDLESLEEP, "5000")),
1508: Long.parseLong(getConfig(INDEXER_BUSYSLEEP, "0")),
1509: Long.parseLong(getConfig(INDEXER_MEMPREREQ,
1510: "1000000")));
1511: }
1512:
1513: deployThread(
1514: PROXY_CACHE_ENQUEUE,
1515: "Proxy Cache Enqueue",
1516: "job takes new input files from RAM stack, stores them, and hands over to the Indexing Stack",
1517: null, new serverInstantThread(this ,
1518: PROXY_CACHE_ENQUEUE_METHOD_START,
1519: PROXY_CACHE_ENQUEUE_METHOD_JOBCOUNT,
1520: PROXY_CACHE_ENQUEUE_METHOD_FREEMEM), 10000);
1521: deployThread(
1522: CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
1523: "Remote Crawl Job",
1524: "thread that performes a single crawl/indexing step triggered by a remote peer",
1525: null,
1526: new serverInstantThread(
1527: crawlQueues,
1528: CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START,
1529: CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
1530: CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM),
1531: 30000);
1532: deployThread(
1533: CRAWLJOB_REMOTE_CRAWL_LOADER,
1534: "Remote Crawl URL Loader",
1535: "thread that loads remote crawl lists from other peers",
1536: "", new serverInstantThread(crawlQueues,
1537: CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START,
1538: CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
1539: CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM),
1540: 30000); // error here?
1541: deployThread(
1542: CRAWLJOB_LOCAL_CRAWL,
1543: "Local Crawl",
1544: "thread that performes a single crawl step from the local crawl queue",
1545: "/IndexCreateWWWLocalQueue_p.html",
1546: new serverInstantThread(crawlQueues,
1547: CRAWLJOB_LOCAL_CRAWL_METHOD_START,
1548: CRAWLJOB_LOCAL_CRAWL_METHOD_JOBCOUNT,
1549: CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM), 10000);
1550: deployThread(
1551: SEED_UPLOAD,
1552: "Seed-List Upload",
1553: "task that a principal peer performes to generate and upload a seed-list to a ftp account",
1554: null, new serverInstantThread(yc,
1555: SEED_UPLOAD_METHOD_START,
1556: SEED_UPLOAD_METHOD_JOBCOUNT,
1557: SEED_UPLOAD_METHOD_FREEMEM), 180000);
1558: serverInstantThread peerPing = null;
1559: deployThread(PEER_PING, "YaCy Core",
1560: "this is the p2p-control and peer-ping task", null,
1561: peerPing = new serverInstantThread(yc,
1562: PEER_PING_METHOD_START,
1563: PEER_PING_METHOD_JOBCOUNT,
1564: PEER_PING_METHOD_FREEMEM), 2000);
1565: peerPing.setSyncObject(new Object());
1566:
1567: deployThread(
1568: INDEX_DIST,
1569: "DHT Distribution",
1570: "selection, transfer and deletion of index entries that are not searched on your peer, but on others",
1571: null, new serverInstantThread(this ,
1572: INDEX_DIST_METHOD_START,
1573: INDEX_DIST_METHOD_JOBCOUNT,
1574: INDEX_DIST_METHOD_FREEMEM), 60000, Long
1575: .parseLong(getConfig(INDEX_DIST_IDLESLEEP,
1576: "5000")), Long.parseLong(getConfig(
1577: INDEX_DIST_BUSYSLEEP, "0")), Long
1578: .parseLong(getConfig(INDEX_DIST_MEMPREREQ,
1579: "1000000")));
1580:
1581: // test routine for snippet fetch
1582: //Set query = new HashSet();
1583: //query.add(plasmaWordIndexEntry.word2hash("Weitergabe"));
1584: //query.add(plasmaWordIndexEntry.word2hash("Zahl"));
1585: //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
1586: //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
1587: //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260);
1588:
1589: this .dbImportManager = new dbImportManager(this );
1590:
1591: log.logConfig("Finished Switchboard Initialization");
1592: }
1593:
1594: public void initMessages(long ramMessage_time) {
1595: this .log.logConfig("Starting Message Board");
1596: File messageDbFile = new File(workPath, DBFILE_MESSAGE);
1597: this .messageDB = new messageBoard(messageDbFile,
1598: ramMessage_time);
1599: this .log.logConfig("Loaded Message Board DB from file "
1600: + messageDbFile.getName() + ", "
1601: + this .messageDB.size() + " entries" + ", "
1602: + ppRamString(messageDbFile.length() / 1024));
1603: }
1604:
1605: public void initWiki(long ramWiki_time) {
1606: this .log.logConfig("Starting Wiki Board");
1607: File wikiDbFile = new File(workPath, DBFILE_WIKI);
1608: this .wikiDB = new wikiBoard(wikiDbFile, new File(workPath,
1609: DBFILE_WIKI_BKP), ramWiki_time);
1610: this .log.logConfig("Loaded Wiki Board DB from file "
1611: + wikiDbFile.getName() + ", " + this .wikiDB.size()
1612: + " entries" + ", "
1613: + ppRamString(wikiDbFile.length() / 1024));
1614: }
1615:
1616: public void initBlog(long ramBlog_time) {
1617: this .log.logConfig("Starting Blog");
1618: File blogDbFile = new File(workPath, DBFILE_BLOG);
1619: this .blogDB = new blogBoard(blogDbFile, ramBlog_time);
1620: this .log.logConfig("Loaded Blog DB from file "
1621: + blogDbFile.getName() + ", " + this .blogDB.size()
1622: + " entries" + ", "
1623: + ppRamString(blogDbFile.length() / 1024));
1624:
1625: File blogCommentDbFile = new File(workPath, DBFILE_BLOGCOMMENTS);
1626: this .blogCommentDB = new blogBoardComments(blogCommentDbFile,
1627: ramBlog_time);
1628: this .log.logConfig("Loaded Blog-Comment DB from file "
1629: + blogCommentDbFile.getName() + ", "
1630: + this .blogCommentDB.size() + " entries" + ", "
1631: + ppRamString(blogCommentDbFile.length() / 1024));
1632: }
1633:
1634: public void initBookmarks() {
1635: this .log.logConfig("Loading Bookmarks DB");
1636: File bookmarksFile = new File(workPath, DBFILE_BOOKMARKS);
1637: File tagsFile = new File(workPath, DBFILE_BOOKMARKS_TAGS);
1638: File datesFile = new File(workPath, DBFILE_BOOKMARKS_DATES);
1639: this .bookmarksDB = new bookmarksDB(bookmarksFile, tagsFile,
1640: datesFile, 2000);
1641: this .log.logConfig("Loaded Bookmarks DB from files "
1642: + bookmarksFile.getName() + ", " + tagsFile.getName());
1643: this .log.logConfig(this .bookmarksDB.tagsSize() + " Tag, "
1644: + this .bookmarksDB.bookmarksSize() + " Bookmarks");
1645: }
1646:
1647: public static plasmaSwitchboard getSwitchboard() {
1648: return sb;
1649: }
1650:
1651: public boolean isRobinsonMode() {
1652: // we are in robinson mode, if we do not exchange index by dht distribution
1653: // we need to take care that search requests and remote indexing requests go only
1654: // to the peers in the same cluster, if we run a robinson cluster.
1655: return !getConfigBool(plasmaSwitchboard.INDEX_DIST_ALLOW, false)
1656: && !getConfigBool(
1657: plasmaSwitchboard.INDEX_RECEIVE_ALLOW, false);
1658: }
1659:
1660: public boolean isPublicRobinson() {
1661: // robinson peers may be member of robinson clusters, which can be public or private
1662: // this does not check the robinson attribute, only the specific subtype of the cluster
1663: String clustermode = getConfig(CLUSTER_MODE,
1664: CLUSTER_MODE_PUBLIC_PEER);
1665: return (clustermode.equals(CLUSTER_MODE_PUBLIC_CLUSTER))
1666: || (clustermode.equals(CLUSTER_MODE_PUBLIC_PEER));
1667: }
1668:
1669: public boolean isInMyCluster(String peer) {
1670: // check if the given peer is in the own network, if this is a robinson cluster
1671: // depending on the robinson cluster type, the peer String may be a peerhash (b64-hash)
1672: // or a ip:port String or simply a ip String
1673: // if this robinson mode does not define a cluster membership, false is returned
1674: if (peer == null)
1675: return false;
1676: if (!isRobinsonMode())
1677: return false;
1678: String clustermode = getConfig(CLUSTER_MODE,
1679: CLUSTER_MODE_PUBLIC_PEER);
1680: if (clustermode.equals(CLUSTER_MODE_PRIVATE_CLUSTER)) {
1681: // check if we got the request from a peer in the private cluster
1682: String network = getConfig(CLUSTER_PEERS_IPPORT, "");
1683: return network.indexOf(peer) >= 0;
1684: } else if (clustermode.equals(CLUSTER_MODE_PUBLIC_CLUSTER)) {
1685: // check if we got the request from a peer in the public cluster
1686: return this .clusterhashes.containsKey(peer);
1687: } else {
1688: return false;
1689: }
1690: }
1691:
1692: public boolean isInMyCluster(yacySeed seed) {
1693: // check if the given peer is in the own network, if this is a robinson cluster
1694: // if this robinson mode does not define a cluster membership, false is returned
1695: if (seed == null)
1696: return false;
1697: if (!isRobinsonMode())
1698: return false;
1699: String clustermode = getConfig(CLUSTER_MODE,
1700: CLUSTER_MODE_PUBLIC_PEER);
1701: if (clustermode.equals(CLUSTER_MODE_PRIVATE_CLUSTER)) {
1702: // check if we got the request from a peer in the private cluster
1703: String network = getConfig(CLUSTER_PEERS_IPPORT, "");
1704: return network.indexOf(seed.getPublicAddress()) >= 0;
1705: } else if (clustermode.equals(CLUSTER_MODE_PUBLIC_CLUSTER)) {
1706: // check if we got the request from a peer in the public cluster
1707: return this .clusterhashes.containsKey(seed.hash);
1708: } else {
1709: return false;
1710: }
1711: }
1712:
1713: public boolean acceptURL(yacyURL url) {
1714: // returns true if the url can be accepted accoring to network.unit.domain
1715: if (url == null)
1716: return false;
1717: String host = url.getHost();
1718: if (host == null)
1719: return false;
1720: if (this .acceptGlobalURLs && this .acceptLocalURLs)
1721: return true; // fast shortcut to avoid dnsResolve
1722: InetAddress hostAddress = serverDomains.dnsResolve(host);
1723: // if we don't know the host, we cannot load that resource anyway.
1724: // But in case we use a proxy, it is possible that we dont have a DNS service.
1725: if (hostAddress == null)
1726: return ((this .remoteProxyConfig != null) && (this .remoteProxyConfig
1727: .useProxy()));
1728: // check if this is a local address and we are allowed to index local pages:
1729: boolean local = hostAddress.isSiteLocalAddress()
1730: || hostAddress.isLoopbackAddress();
1731: return (this .acceptGlobalURLs && !local)
1732: || (this .acceptLocalURLs && local);
1733: }
1734:
1735: public String urlExists(String hash) {
1736: // tests if hash occurrs in any database
1737: // if it exists, the name of the database is returned,
1738: // if it not exists, null is returned
1739: if (wordIndex.loadedURL.exists(hash))
1740: return "loaded";
1741: return this .crawlQueues.urlExists(hash);
1742: }
1743:
1744: public void urlRemove(String hash) {
1745: wordIndex.loadedURL.remove(hash);
1746: crawlQueues.urlRemove(hash);
1747: }
1748:
1749: public yacyURL getURL(String urlhash) {
1750: if (urlhash == null)
1751: return null;
1752: if (urlhash.equals(yacyURL.dummyHash))
1753: return null;
1754: yacyURL ne = crawlQueues.getURL(urlhash);
1755: if (ne != null)
1756: return ne;
1757: indexURLEntry le = wordIndex.loadedURL.load(urlhash, null, 0);
1758: if (le != null)
1759: return le.comp().url();
1760: return null;
1761: }
1762:
1763: public plasmaSearchRankingProfile getRanking() {
1764: return (getConfig("rankingProfile", "").length() == 0) ? new plasmaSearchRankingProfile(
1765: plasmaSearchQuery.CONTENTDOM_TEXT)
1766: : new plasmaSearchRankingProfile("", crypt
1767: .simpleDecode(sb
1768: .getConfig("rankingProfile", ""), null));
1769: }
1770:
1771: /**
1772: * This method changes the HTCache size.<br>
1773: * @param newCacheSize in MB
1774: */
1775: public final void setCacheSize(long newCacheSize) {
1776: plasmaHTCache.setCacheSize(1048576 * newCacheSize);
1777: }
1778:
1779: public boolean onlineCaution() {
1780: return (System.currentTimeMillis() - this .proxyLastAccess < Integer
1781: .parseInt(getConfig(PROXY_ONLINE_CAUTION_DELAY, "30000")))
1782: || (System.currentTimeMillis()
1783: - this .localSearchLastAccess < Integer
1784: .parseInt(getConfig(
1785: LOCALSEACH_ONLINE_CAUTION_DELAY,
1786: "30000")))
1787: || (System.currentTimeMillis()
1788: - this .remoteSearchLastAccess < Integer
1789: .parseInt(getConfig(
1790: REMOTESEARCH_ONLINE_CAUTION_DELAY,
1791: "30000")));
1792: }
1793:
1794: private static String ppRamString(long bytes) {
1795: if (bytes < 1024)
1796: return bytes + " KByte";
1797: bytes = bytes / 1024;
1798: if (bytes < 1024)
1799: return bytes + " MByte";
1800: bytes = bytes / 1024;
1801: if (bytes < 1024)
1802: return bytes + " GByte";
1803: return (bytes / 1024) + "TByte";
1804: }
1805:
1806: private void initActiveCrawlProfiles() {
1807: this .defaultProxyProfile = null;
1808: this .defaultRemoteProfile = null;
1809: this .defaultTextSnippetProfile = null;
1810: this .defaultMediaSnippetProfile = null;
1811: Iterator<plasmaCrawlProfile.entry> i = this .profilesActiveCrawls
1812: .profiles(true);
1813: plasmaCrawlProfile.entry profile;
1814: String name;
1815: while (i.hasNext()) {
1816: profile = i.next();
1817: name = profile.name();
1818: if (name.equals(CRAWL_PROFILE_PROXY))
1819: this .defaultProxyProfile = profile;
1820: if (name.equals(CRAWL_PROFILE_REMOTE))
1821: this .defaultRemoteProfile = profile;
1822: if (name.equals(CRAWL_PROFILE_SNIPPET_TEXT))
1823: this .defaultTextSnippetProfile = profile;
1824: if (name.equals(CRAWL_PROFILE_SNIPPET_MEDIA))
1825: this .defaultMediaSnippetProfile = profile;
1826: }
1827: if (this .defaultProxyProfile == null) {
1828: // generate new default entry for proxy crawling
1829: this .defaultProxyProfile = this .profilesActiveCrawls
1830: .newEntry("proxy", null, ".*", ".*", Integer
1831: .parseInt(getConfig(PROXY_PREFETCH_DEPTH,
1832: "0")), Integer.parseInt(getConfig(
1833: PROXY_PREFETCH_DEPTH, "0")), 60 * 24, -1,
1834: -1, false, getConfigBool(
1835: PROXY_INDEXING_LOCAL_TEXT, true),
1836: getConfigBool(PROXY_INDEXING_LOCAL_MEDIA,
1837: true), true, true, getConfigBool(
1838: PROXY_INDEXING_REMOTE, false),
1839: true, true, true);
1840: }
1841: if (this .defaultRemoteProfile == null) {
1842: // generate new default entry for remote crawling
1843: defaultRemoteProfile = this .profilesActiveCrawls.newEntry(
1844: CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0, -1,
1845: -1, -1, true, true, true, false, true, false, true,
1846: true, false);
1847: }
1848: if (this .defaultTextSnippetProfile == null) {
1849: // generate new default entry for snippet fetch and optional crawling
1850: defaultTextSnippetProfile = this .profilesActiveCrawls
1851: .newEntry(CRAWL_PROFILE_SNIPPET_TEXT, null, ".*",
1852: ".*", 0, 0, 60 * 24 * 30, -1, -1, true,
1853: true, true, true, true, false, true, true,
1854: false);
1855: }
1856: if (this .defaultMediaSnippetProfile == null) {
1857: // generate new default entry for snippet fetch and optional crawling
1858: defaultMediaSnippetProfile = this .profilesActiveCrawls
1859: .newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, null, ".*",
1860: ".*", 0, 0, 60 * 24 * 30, -1, -1, true,
1861: false, true, true, true, false, true, true,
1862: false);
1863: }
1864: }
1865:
1866: private void resetProfiles() {
1867: final File pdb = new File(plasmaPath,
1868: DBFILE_ACTIVE_CRAWL_PROFILES);
1869: if (pdb.exists())
1870: pdb.delete();
1871: long ramProfiles_time = getConfigLong(RAM_CACHE_PROFILES_TIME,
1872: 1000);
1873: profilesActiveCrawls = new plasmaCrawlProfile(pdb,
1874: ramProfiles_time);
1875: initActiveCrawlProfiles();
1876: }
1877:
1878: /**
1879: * {@link plasmaCrawlProfile Crawl Profiles} are saved independantly from the queues themselves
1880: * and therefore have to be cleaned up from time to time. This method only performs the clean-up
1881: * if - and only if - the {@link plasmaSwitchboardQueue switchboard},
1882: * {@link plasmaProtocolLoader loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
1883: * <p>
1884: * Then it iterates through all existing {@link plasmaCrawlProfile crawl profiles} and removes
1885: * all profiles which are not hardcoded.
1886: * </p>
1887: * <p>
1888: * <i>If this method encounters DB-failures, the profile DB will be resetted and</i>
1889: * <code>true</code><i> will be returned</i>
1890: * </p>
1891: * @see #CRAWL_PROFILE_PROXY hardcoded
1892: * @see #CRAWL_PROFILE_REMOTE hardcoded
1893: * @see #CRAWL_PROFILE_SNIPPET_TEXT hardcoded
1894: * @see #CRAWL_PROFILE_SNIPPET_MEDIA hardcoded
1895: * @return whether this method has done something or not (i.e. because the queues have been filled
1896: * or there are no profiles left to clean up)
1897: * @throws <b>InterruptedException</b> if the current thread has been interrupted, i.e. by the
1898: * shutdown procedure
1899: */
1900: public boolean cleanProfiles() throws InterruptedException {
1901: if ((sbQueue.size() > 0) || (crawlQueues.size() > 0)
1902: || (crawlStacker != null && crawlStacker.size() > 0)
1903: || (crawlQueues.noticeURL.notEmpty()))
1904: return false;
1905: final Iterator<plasmaCrawlProfile.entry> iter = profilesActiveCrawls
1906: .profiles(true);
1907: plasmaCrawlProfile.entry entry;
1908: boolean hasDoneSomething = false;
1909: try {
1910: while (iter.hasNext()) {
1911: // check for interruption
1912: if (Thread.currentThread().isInterrupted())
1913: throw new InterruptedException(
1914: "Shutdown in progress");
1915:
1916: // getting next profile
1917: entry = iter.next();
1918: if (!((entry.name().equals(CRAWL_PROFILE_PROXY))
1919: || (entry.name().equals(CRAWL_PROFILE_REMOTE))
1920: || (entry.name()
1921: .equals(CRAWL_PROFILE_SNIPPET_TEXT)) || (entry
1922: .name().equals(CRAWL_PROFILE_SNIPPET_MEDIA)))) {
1923: profilesPassiveCrawls.newEntry(entry.map());
1924: iter.remove();
1925: hasDoneSomething = true;
1926: }
1927: }
1928: } catch (kelondroException e) {
1929: resetProfiles();
1930: hasDoneSomething = true;
1931: }
1932: return hasDoneSomething;
1933: }
1934:
1935: synchronized public boolean htEntryStoreProcess(
1936: plasmaHTCache.Entry entry) {
1937:
1938: if (entry == null)
1939: return false;
1940:
1941: /* =========================================================================
1942: * PARSER SUPPORT
1943: *
1944: * Testing if the content type is supported by the available parsers
1945: * ========================================================================= */
1946: boolean isSupportedContent = plasmaParser.supportedContent(
1947: entry.url(), entry.getMimeType());
1948:
1949: /* =========================================================================
1950: * INDEX CONTROL HEADER
1951: *
1952: * With the X-YACY-Index-Control header set to "no-index" a client could disallow
1953: * yacy to index the response returned as answer to a request
1954: * ========================================================================= */
1955: boolean doIndexing = true;
1956: if (entry.requestProhibitsIndexing()) {
1957: doIndexing = false;
1958: if (this .log.isFine())
1959: this .log.logFine("Crawling of " + entry.url()
1960: + " prohibited by request.");
1961: }
1962:
1963: /* =========================================================================
1964: * LOCAL IP ADDRESS CHECK
1965: *
1966: * check if ip is local ip address // TODO: remove this procotol specific code here
1967: * ========================================================================= */
1968: if (!acceptURL(entry.url())) {
1969: this .log.logFine("Host in URL '" + entry.url()
1970: + "' is not in defined indexing domain.");
1971: doIndexing = false;
1972: }
1973:
1974: /* =========================================================================
1975: * STORING DATA
1976: *
1977: * Now we store the response header and response content if
1978: * a) the user has configured to use the htcache or
1979: * b) the content should be indexed
1980: * ========================================================================= */
1981: if ((entry.profile().storeHTCache())
1982: || (doIndexing && isSupportedContent)) {
1983: // store response header
1984: if (entry.writeResourceInfo()) {
1985: this .log.logInfo("WROTE HEADER for "
1986: + entry.cacheFile());
1987: }
1988:
1989: // work off unwritten files
1990: if (entry.cacheArray() == null) {
1991: //this.log.logFine("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile);
1992: } else {
1993: String error = entry.shallStoreCacheForProxy();
1994: if (error == null) {
1995: plasmaHTCache.writeResourceContent(entry.url(),
1996: entry.cacheArray());
1997: this .log.logFine("WROTE FILE ("
1998: + entry.cacheArray().length
1999: + " bytes) for " + entry.cacheFile());
2000: } else {
2001: this .log.logFine("WRITE OF FILE "
2002: + entry.cacheFile() + " FORBIDDEN: "
2003: + error);
2004: }
2005: }
2006: }
2007:
2008: /* =========================================================================
2009: * INDEXING
2010: * ========================================================================= */
2011: if (doIndexing && isSupportedContent) {
2012:
2013: // registering the cachefile as in use
2014: if (entry.cacheFile().exists()) {
2015: plasmaHTCache.filesInUse.add(entry.cacheFile());
2016: }
2017:
2018: // enqueue for further crawling
2019: enQueue(this .sbQueue.newEntry(entry.url(), (entry
2020: .referrerURL() == null) ? null : entry
2021: .referrerURL().hash(), entry.ifModifiedSince(),
2022: entry.requestWithCookie(), entry.initiator(), entry
2023: .depth(), entry.profile().handle(), entry
2024: .name()));
2025: } else {
2026: if (!entry.profile().storeHTCache()
2027: && entry.cacheFile().exists()) {
2028: plasmaHTCache.deleteURLfromCache(entry.url());
2029: }
2030: }
2031:
2032: return true;
2033: }
2034:
2035: public boolean htEntryStoreJob() {
2036: if (plasmaHTCache.empty())
2037: return false;
2038: return htEntryStoreProcess(plasmaHTCache.pop());
2039: }
2040:
2041: public int htEntrySize() {
2042: return plasmaHTCache.size();
2043: }
2044:
2045: public void close() {
2046: log
2047: .logConfig("SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:");
2048: serverProfiling.stopSystemProfiling();
2049: moreMemory.cancel();
2050: terminateAllThreads(true);
2051: if (transferIdxThread != null)
2052: stopTransferWholeIndex(false);
2053: log
2054: .logConfig("SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing");
2055: // closing all still running db importer jobs
2056: this .dbImportManager.close();
2057: crawlQueues.close();
2058: wikiDB.close();
2059: blogDB.close();
2060: blogCommentDB.close();
2061: userDB.close();
2062: bookmarksDB.close();
2063: messageDB.close();
2064: if (facilityDB != null)
2065: facilityDB.close();
2066: crawlStacker.close();
2067: profilesActiveCrawls.close();
2068: robots.close();
2069: parser.close();
2070: plasmaHTCache.close();
2071: sbQueue.close();
2072: httpc.closeAllConnections();
2073: webStructure.flushCitationReference("crg");
2074: webStructure.close();
2075: log
2076: .logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
2077: wordIndex.close();
2078: yc.close();
2079: log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED");
2080: }
2081:
2082: public int queueSize() {
2083: return sbQueue.size();
2084: }
2085:
2086: public void enQueue(Object job) {
2087: assert job != null;
2088: if (!(job instanceof plasmaSwitchboardQueue.Entry)) {
2089: System.out
2090: .println("Internal error at plasmaSwitchboard.enQueue: wrong job type");
2091: System.exit(0);
2092: }
2093: try {
2094: sbQueue.push((plasmaSwitchboardQueue.Entry) job);
2095: } catch (IOException e) {
2096: log.logSevere("IOError in plasmaSwitchboard.enQueue: "
2097: + e.getMessage(), e);
2098: }
2099: }
2100:
2101: public void deQueueFreeMem() {
2102: // flush some entries from the RAM cache
2103: wordIndex.flushCacheSome();
2104: // adopt maximum cache size to current size to prevent that further OutOfMemoryErrors occur
2105: /* int newMaxCount = Math.max(1200, Math.min((int) getConfigLong(WORDCACHE_MAX_COUNT, 1200), wordIndex.dhtOutCacheSize()));
2106: setConfig(WORDCACHE_MAX_COUNT, Integer.toString(newMaxCount));
2107: wordIndex.setMaxWordCount(newMaxCount); */
2108: }
2109:
2110: public boolean deQueue() {
2111: try {
2112: // work off fresh entries from the proxy or from the crawler
2113: if (onlineCaution()) {
2114: log
2115: .logFine("deQueue: online caution, omitting resource stack processing");
2116: return false;
2117: }
2118:
2119: // flush some entries from the RAM cache
2120: if (sbQueue.size() == 0)
2121: wordIndex.flushCacheSome(); // permanent flushing only if we are not busy
2122:
2123: boolean doneSomething = false;
2124:
2125: // possibly delete entries from last chunk
2126: if ((this .dhtTransferChunk != null)
2127: && (this .dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) {
2128: String deletedURLs = this .dhtTransferChunk
2129: .deleteTransferIndexes();
2130: this .log.logFine("Deleted from "
2131: + this .dhtTransferChunk.containers().length
2132: + " transferred RWIs locally, removed "
2133: + deletedURLs + " URL references");
2134: this .dhtTransferChunk = null;
2135: }
2136:
2137: // generate a dht chunk
2138: if ((dhtShallTransfer() == null)
2139: && ((this .dhtTransferChunk == null)
2140: || (this .dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) ||
2141: // (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) ||
2142: (this .dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED))) {
2143: // generate new chunk
2144: int minChunkSize = (int) getConfigLong(
2145: INDEX_DIST_CHUNK_SIZE_MIN, 30);
2146: dhtTransferChunk = new plasmaDHTChunk(this .log,
2147: wordIndex, minChunkSize, dhtTransferIndexCount,
2148: 5000);
2149: doneSomething = true;
2150: }
2151:
2152: // check for interruption
2153: checkInterruption();
2154:
2155: // getting the next entry from the indexing queue
2156: synchronized (sbQueue) {
2157:
2158: if (sbQueue.size() == 0) {
2159: //log.logFine("deQueue: nothing to do, queue is emtpy");
2160: return doneSomething; // nothing to do
2161: }
2162:
2163: if (crawlStacker.size() >= getConfigLong(
2164: CRAWLSTACK_SLOTS, 2000)) {
2165: log
2166: .logFine("deQueue: too many processes in stack crawl thread queue ("
2167: + "stackCrawlQueue="
2168: + crawlStacker.size() + ")");
2169: return doneSomething;
2170: }
2171:
2172: plasmaSwitchboardQueue.Entry nextentry;
2173:
2174: // if we were interrupted we should return now
2175: if (Thread.currentThread().isInterrupted()) {
2176: log.logFine("deQueue: thread was interrupted");
2177: return false;
2178: }
2179:
2180: // do one processing step
2181: log
2182: .logFine("DEQUEUE: sbQueueSize="
2183: + sbQueue.size()
2184: + ", coreStackSize="
2185: + crawlQueues.noticeURL
2186: .stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)
2187: + ", limitStackSize="
2188: + crawlQueues.noticeURL
2189: .stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT)
2190: + ", overhangStackSize="
2191: + crawlQueues.noticeURL
2192: .stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG)
2193: + ", remoteStackSize="
2194: + crawlQueues.noticeURL
2195: .stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
2196: try {
2197: int sizeBefore = sbQueue.size();
2198: nextentry = sbQueue.pop();
2199: if (nextentry == null) {
2200: log
2201: .logWarning("deQueue: null entry on queue stack.");
2202: if (sbQueue.size() == sizeBefore) {
2203: // this is a severe problem: because this time a null is returned, it means that this status will last forever
2204: // to re-enable use of the sbQueue, it must be emptied completely
2205: log
2206: .logSevere("deQueue: does not shrink after pop() == null. Emergency reset.");
2207: sbQueue.clear();
2208: }
2209: return false;
2210: }
2211: } catch (IOException e) {
2212: log.logSevere(
2213: "IOError in plasmaSwitchboard.deQueue: "
2214: + e.getMessage(), e);
2215: return doneSomething;
2216: }
2217:
2218: synchronized (this .indexingTasksInProcess) {
2219: this .indexingTasksInProcess.put(
2220: nextentry.urlHash(), nextentry);
2221: }
2222:
2223: // parse and index the resource
2224: processResourceStack(nextentry);
2225: }
2226: return true;
2227: } catch (InterruptedException e) {
2228: log.logInfo("DEQUEUE: Shutdown detected.");
2229: return false;
2230: }
2231: }
2232:
2233: public int cleanupJobSize() {
2234: int c = 0;
2235: if ((crawlQueues.delegatedURL.stackSize() > 1000))
2236: c++;
2237: if ((crawlQueues.errorURL.stackSize() > 1000))
2238: c++;
2239: for (int i = 1; i <= 6; i++) {
2240: if (wordIndex.loadedURL.getStackSize(i) > 1000)
2241: c++;
2242: }
2243: return c;
2244: }
2245:
2246: public boolean cleanupJob() {
2247: try {
2248: boolean hasDoneSomething = false;
2249:
2250: // do transmission of CR-files
2251: checkInterruption();
2252: int count = rankingOwnDistribution.size() / 100;
2253: if (count == 0)
2254: count = 1;
2255: if (count > 5)
2256: count = 5;
2257: if (rankingOn) {
2258: rankingOwnDistribution.transferRanking(count);
2259: rankingOtherDistribution.transferRanking(1);
2260: }
2261:
2262: // clean up delegated stack
2263: checkInterruption();
2264: if ((crawlQueues.delegatedURL.stackSize() > 1000)) {
2265: log.logFine("Cleaning Delegated-URLs report stack, "
2266: + crawlQueues.delegatedURL.stackSize()
2267: + " entries on stack");
2268: crawlQueues.delegatedURL.clearStack();
2269: hasDoneSomething = true;
2270: }
2271:
2272: // clean up error stack
2273: checkInterruption();
2274: if ((crawlQueues.errorURL.stackSize() > 1000)) {
2275: log.logFine("Cleaning Error-URLs report stack, "
2276: + crawlQueues.errorURL.stackSize()
2277: + " entries on stack");
2278: crawlQueues.errorURL.clearStack();
2279: hasDoneSomething = true;
2280: }
2281:
2282: // clean up loadedURL stack
2283: for (int i = 1; i <= 6; i++) {
2284: checkInterruption();
2285: if (wordIndex.loadedURL.getStackSize(i) > 1000) {
2286: log.logFine("Cleaning Loaded-URLs report stack, "
2287: + wordIndex.loadedURL.getStackSize(i)
2288: + " entries on stack " + i);
2289: wordIndex.loadedURL.clearStack(i);
2290: hasDoneSomething = true;
2291: }
2292: }
2293:
2294: // clean up profiles
2295: checkInterruption();
2296: if (cleanProfiles())
2297: hasDoneSomething = true;
2298:
2299: // clean up news
2300: checkInterruption();
2301: try {
2302: log.logFine("Cleaning Incoming News, "
2303: + yacyCore.newsPool
2304: .size(yacyNewsPool.INCOMING_DB)
2305: + " entries on stack");
2306: if (yacyCore.newsPool.automaticProcess() > 0)
2307: hasDoneSomething = true;
2308: } catch (IOException e) {
2309: }
2310: if (getConfigBool("cleanup.deletionProcessedNews", true)) {
2311: yacyCore.newsPool.clear(yacyNewsPool.PROCESSED_DB);
2312: }
2313: if (getConfigBool("cleanup.deletionPublishedNews", true)) {
2314: yacyCore.newsPool.clear(yacyNewsPool.PUBLISHED_DB);
2315: }
2316:
2317: // clean up seed-dbs
2318: if (getConfigBool("routing.deleteOldSeeds.permission", true)) {
2319: final long deleteOldSeedsTime = getConfigLong(
2320: "routing.deleteOldSeeds.time", 7) * 24 * 3600000;
2321: Iterator<yacySeed> e = yacyCore.seedDB
2322: .seedsSortedDisconnected(true,
2323: yacySeed.LASTSEEN);
2324: yacySeed seed = null;
2325: ArrayList<String> deleteQueue = new ArrayList<String>();
2326: checkInterruption();
2327: //clean passive seeds
2328: while (e.hasNext()) {
2329: seed = e.next();
2330: if (seed != null) {
2331: //list is sorted -> break when peers are too young to delete
2332: if (seed.getLastSeenUTC() > (System
2333: .currentTimeMillis() - deleteOldSeedsTime))
2334: break;
2335: deleteQueue.add(seed.hash);
2336: }
2337: }
2338: for (int i = 0; i < deleteQueue.size(); ++i)
2339: yacyCore.seedDB
2340: .removeDisconnected((String) deleteQueue
2341: .get(i));
2342: deleteQueue.clear();
2343: e = yacyCore.seedDB.seedsSortedPotential(true,
2344: yacySeed.LASTSEEN);
2345: checkInterruption();
2346: //clean potential seeds
2347: while (e.hasNext()) {
2348: seed = (yacySeed) e.next();
2349: if (seed != null) {
2350: //list is sorted -> break when peers are too young to delete
2351: if (seed.getLastSeenUTC() > (System
2352: .currentTimeMillis() - deleteOldSeedsTime))
2353: break;
2354: deleteQueue.add(seed.hash);
2355: }
2356: }
2357: for (int i = 0; i < deleteQueue.size(); ++i)
2358: yacyCore.seedDB
2359: .removePotential((String) deleteQueue
2360: .get(i));
2361: }
2362:
2363: // check if update is available and
2364: // if auto-update is activated perform an automatic installation and restart
2365: yacyVersion updateVersion = yacyVersion
2366: .rulebasedUpdateInfo(false);
2367: if (updateVersion != null)
2368: try {
2369: // there is a version that is more recent. Load it and re-start with it
2370: log
2371: .logInfo("AUTO-UPDATE: downloading more recent release "
2372: + updateVersion.url);
2373: yacyVersion.downloadRelease(updateVersion);
2374: File releaseFile = new File(sb.getRootPath(),
2375: "DATA/RELEASE/" + updateVersion.name);
2376: boolean devenvironment = yacyVersion
2377: .combined2prettyVersion(
2378: sb.getConfig("version", "0.1"))
2379: .startsWith("dev");
2380: if (devenvironment) {
2381: log
2382: .logInfo("AUTO-UPDATE: omiting update because this is a development environment");
2383: } else if ((!releaseFile.exists())
2384: || (releaseFile.length() == 0)) {
2385: log
2386: .logInfo("AUTO-UPDATE: omiting update because download failed (file cannot be found or is too small)");
2387: } else {
2388: yacyVersion.deployRelease(updateVersion.name);
2389: terminate(5000);
2390: log
2391: .logInfo("AUTO-UPDATE: deploy and restart initiated");
2392: }
2393: } catch (IOException e) {
2394: log
2395: .logSevere("AUTO-UPDATE: could not download and install release "
2396: + updateVersion.url
2397: + ": "
2398: + e.getMessage());
2399: }
2400:
2401: // initiate broadcast about peer startup to spread supporter url
2402: if (yacyCore.newsPool.size(yacyNewsPool.OUTGOING_DB) == 0) {
2403: // read profile
2404: final Properties profile = new Properties();
2405: FileInputStream fileIn = null;
2406: try {
2407: fileIn = new FileInputStream(new File(
2408: "DATA/SETTINGS/profile.txt"));
2409: profile.load(fileIn);
2410: } catch (IOException e) {
2411: } finally {
2412: if (fileIn != null)
2413: try {
2414: fileIn.close();
2415: } catch (Exception e) {
2416: }
2417: }
2418: String homepage = (String) profile.get("homepage");
2419: if ((homepage != null) && (homepage.length() > 10)) {
2420: Properties news = new Properties();
2421: news.put("homepage", profile.get("homepage"));
2422: yacyCore.newsPool
2423: .publishMyNews(yacyNewsRecord
2424: .newRecord(
2425: yacyNewsPool.CATEGORY_PROFILE_BROADCAST,
2426: news));
2427: }
2428: }
2429: /*
2430: // set a maximum amount of memory for the caches
2431: // long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());
2432: // setConfig(INDEXER_MEMPREREQ, memprereq);
2433: // setThreadPerformance(INDEXER, getConfigLong(INDEXER_IDLESLEEP, 0), getConfigLong(INDEXER_BUSYSLEEP, 0), memprereq);
2434: kelondroCachedRecords.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
2435: kelondroCache.setCacheGrowStati(40 * 1024 * 1024, 20 * 1024 * 1024);
2436: */
2437: // update the cluster set
2438: this .clusterhashes = yacyCore.seedDB
2439: .clusterHashes(getConfig(
2440: "cluster.peers.yacydomain", ""));
2441:
2442: return hasDoneSomething;
2443: } catch (InterruptedException e) {
2444: this .log.logInfo("cleanupJob: Shutdown detected");
2445: return false;
2446: }
2447: }
2448:
2449: /**
2450: * Creates a new File instance with absolute path of ours Seed File.<br>
2451: * @return a new File instance
2452: */
2453: public File getOwnSeedFile() {
2454: return getConfigPath(OWN_SEED_FILE, DBFILE_OWN_SEED);
2455: }
2456:
2457: /**
2458: * With this function the crawling process can be paused
2459: */
2460: public void pauseCrawlJob(String jobType) {
2461: Object[] status = (Object[]) this .crawlJobsStatus.get(jobType);
2462: synchronized (status[CRAWLJOB_SYNC]) {
2463: status[CRAWLJOB_STATUS] = Boolean.TRUE;
2464: }
2465: setConfig(jobType + "_isPaused", "true");
2466: }
2467:
2468: /**
2469: * Continue the previously paused crawling
2470: */
2471: public void continueCrawlJob(String jobType) {
2472: Object[] status = (Object[]) this .crawlJobsStatus.get(jobType);
2473: synchronized (status[CRAWLJOB_SYNC]) {
2474: if (((Boolean) status[CRAWLJOB_STATUS]).booleanValue()) {
2475: status[CRAWLJOB_STATUS] = Boolean.FALSE;
2476: status[CRAWLJOB_SYNC].notifyAll();
2477: }
2478: }
2479: setConfig(jobType + "_isPaused", "false");
2480: }
2481:
2482: /**
2483: * @return <code>true</code> if crawling was paused or <code>false</code> otherwise
2484: */
2485: public boolean crawlJobIsPaused(String jobType) {
2486: Object[] status = (Object[]) this .crawlJobsStatus.get(jobType);
2487: synchronized (status[CRAWLJOB_SYNC]) {
2488: return ((Boolean) status[CRAWLJOB_STATUS]).booleanValue();
2489: }
2490: }
2491:
2492: private plasmaParserDocument parseResource(
2493: plasmaSwitchboardQueue.Entry entry, String initiatorHash)
2494: throws InterruptedException, ParserException {
2495:
2496: // the mimetype of this entry
2497: String mimeType = entry.getMimeType();
2498: String charset = entry.getCharacterEncoding();
2499:
2500: // the parser logger
2501: //serverLog parserLogger = parser.getLogger();
2502:
2503: // parse the document
2504: return parseResource(entry.url(), mimeType, charset, entry
2505: .cacheFile());
2506: }
2507:
2508: public plasmaParserDocument parseResource(yacyURL location,
2509: String mimeType, String documentCharset, File sourceFile)
2510: throws InterruptedException, ParserException {
2511: plasmaParserDocument doc = parser.parseSource(location,
2512: mimeType, documentCharset, sourceFile);
2513: assert (doc != null) : "Unexpected error. Parser returned null.";
2514: return doc;
2515: }
2516:
2517: private void processResourceStack(plasmaSwitchboardQueue.Entry entry)
2518: throws InterruptedException {
2519: plasmaParserDocument document = null;
2520: try {
2521: // work off one stack entry with a fresh resource
2522: long stackStartTime = 0, stackEndTime = 0, parsingStartTime = 0, parsingEndTime = 0, indexingStartTime = 0, indexingEndTime = 0, storageStartTime = 0, storageEndTime = 0;
2523:
2524: // we must distinguish the following cases: resource-load was initiated by
2525: // 1) global crawling: the index is extern, not here (not possible here)
2526: // 2) result of search queries, some indexes are here (not possible here)
2527: // 3) result of index transfer, some of them are here (not possible here)
2528: // 4) proxy-load (initiator is "------------")
2529: // 5) local prefetch/crawling (initiator is own seedHash)
2530: // 6) local fetching for global crawling (other known or unknwon initiator)
2531: int processCase = PROCESSCASE_0_UNKNOWN;
2532: yacySeed initiatorPeer = null;
2533: String initiatorPeerHash = (entry.proxy()) ? yacyURL.dummyHash
2534: : entry.initiator();
2535: if (initiatorPeerHash.equals(yacyURL.dummyHash)) {
2536: // proxy-load
2537: processCase = PROCESSCASE_4_PROXY_LOAD;
2538: } else if (initiatorPeerHash.equals(yacyCore.seedDB
2539: .mySeed().hash)) {
2540: // normal crawling
2541: processCase = PROCESSCASE_5_LOCAL_CRAWLING;
2542: } else {
2543: // this was done for remote peer (a global crawl)
2544: initiatorPeer = yacyCore.seedDB
2545: .getConnected(initiatorPeerHash);
2546: processCase = PROCESSCASE_6_GLOBAL_CRAWLING;
2547: }
2548:
2549: log.logFine("processResourceStack processCase="
2550: + processCase
2551: + ", depth="
2552: + entry.depth()
2553: + ", maxDepth="
2554: + ((entry.profile() == null) ? "null" : Integer
2555: .toString(entry.profile().generalDepth()))
2556: + ", filter="
2557: + ((entry.profile() == null) ? "null" : entry
2558: .profile().generalFilter())
2559: + ", initiatorHash=" + initiatorPeerHash +
2560: //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
2561: ", url=" + entry.url()); // DEBUG
2562:
2563: /* =========================================================================
2564: * PARSE CONTENT
2565: * ========================================================================= */
2566: parsingStartTime = System.currentTimeMillis();
2567:
2568: try {
2569: document = this .parseResource(entry, initiatorPeerHash);
2570: if (document == null)
2571: return;
2572: } catch (ParserException e) {
2573: this .log.logInfo("Unable to parse the resource '"
2574: + entry.url() + "'. " + e.getMessage());
2575: addURLtoErrorDB(entry.url(), entry.referrerHash(),
2576: initiatorPeerHash, entry.anchorName(), e
2577: .getErrorCode(), new kelondroBitfield());
2578: if (document != null) {
2579: document.close();
2580: document = null;
2581: }
2582: return;
2583: }
2584:
2585: parsingEndTime = System.currentTimeMillis();
2586:
2587: // getting the document date
2588: Date docDate = entry.getModificationDate();
2589:
2590: /* =========================================================================
2591: * put anchors on crawl stack
2592: * ========================================================================= */
2593: stackStartTime = System.currentTimeMillis();
2594: if (((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING))
2595: && ((entry.profile() == null) || (entry.depth() < entry
2596: .profile().generalDepth()))) {
2597: Map<yacyURL, String> hl = document.getHyperlinks();
2598: Iterator<Map.Entry<yacyURL, String>> i = hl.entrySet()
2599: .iterator();
2600: yacyURL nextUrl;
2601: Map.Entry<yacyURL, String> nextEntry;
2602: while (i.hasNext()) {
2603: // check for interruption
2604: checkInterruption();
2605:
2606: // fetching the next hyperlink
2607: nextEntry = i.next();
2608: nextUrl = nextEntry.getKey();
2609: // enqueue the hyperlink into the pre-notice-url db
2610: crawlStacker
2611: .enqueueEntry(nextUrl, entry.urlHash(),
2612: initiatorPeerHash, nextEntry
2613: .getValue(), docDate, entry
2614: .depth() + 1, entry
2615: .profile());
2616: }
2617: log
2618: .logInfo("CRAWL: ADDED "
2619: + hl.size()
2620: + " LINKS FROM "
2621: + entry.url().toNormalform(false, true)
2622: + ", NEW CRAWL STACK SIZE IS "
2623: + crawlQueues.noticeURL
2624: .stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
2625: }
2626: stackEndTime = System.currentTimeMillis();
2627:
2628: /* =========================================================================
2629: * CREATE INDEX
2630: * ========================================================================= */
2631: String dc_title = document.dc_title();
2632: yacyURL referrerURL = entry.referrerURL();
2633:
2634: String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
2635: if (processCase == PROCESSCASE_4_PROXY_LOAD) {
2636: // proxy-load
2637: noIndexReason = entry.shallIndexCacheForProxy();
2638: } else {
2639: // normal crawling
2640: noIndexReason = entry.shallIndexCacheForCrawler();
2641: }
2642:
2643: if (noIndexReason == null) {
2644: // strip out words
2645: indexingStartTime = System.currentTimeMillis();
2646:
2647: checkInterruption();
2648: log.logFine("Condensing for '"
2649: + entry.url().toNormalform(false, true) + "'");
2650: plasmaCondenser condenser = new plasmaCondenser(
2651: document, entry.profile().indexText(), entry
2652: .profile().indexMedia());
2653:
2654: // generate citation reference
2655: Integer[] ioLinks = webStructure
2656: .generateCitationReference(entry.url(), entry
2657: .urlHash(), docDate, document,
2658: condenser); // [outlinksSame, outlinksOther]
2659:
2660: try {
2661: // check for interruption
2662: checkInterruption();
2663:
2664: // create a new loaded URL db entry
2665: long ldate = System.currentTimeMillis();
2666: indexURLEntry newEntry = new indexURLEntry(
2667: entry.url(), // URL
2668: dc_title, // document description
2669: document.dc_creator(), // author
2670: document.dc_subject(' '), // tags
2671: "", // ETag
2672: docDate, // modification date
2673: new Date(), // loaded date
2674: new Date(ldate
2675: + Math.max(0, ldate
2676: - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
2677: (referrerURL == null) ? null : referrerURL
2678: .hash(), // referer hash
2679: new byte[0], // md5
2680: (int) entry.size(), // size
2681: condenser.RESULT_NUMB_WORDS, // word count
2682: plasmaHTCache.docType(document.dc_format()), // doctype
2683: condenser.RESULT_FLAGS, // flags
2684: yacyURL.language(entry.url()), // language
2685: ioLinks[0].intValue(), // llocal
2686: ioLinks[1].intValue(), // lother
2687: document.getAudiolinks().size(), // laudio
2688: document.getImages().size(), // limage
2689: document.getVideolinks().size(), // lvideo
2690: document.getApplinks().size() // lapp
2691: );
2692: /* ========================================================================
2693: * STORE URL TO LOADED-URL-DB
2694: * ======================================================================== */
2695: wordIndex.loadedURL.store(newEntry);
2696: wordIndex.loadedURL.stack(newEntry, // loaded url db entry
2697: initiatorPeerHash, // initiator peer hash
2698: yacyCore.seedDB.mySeed().hash, // executor peer hash
2699: processCase // process case
2700: );
2701:
2702: // check for interruption
2703: checkInterruption();
2704:
2705: /* ========================================================================
2706: * STORE WORD INDEX
2707: * ======================================================================== */
2708: if (((processCase == PROCESSCASE_4_PROXY_LOAD)
2709: || (processCase == PROCESSCASE_5_LOCAL_CRAWLING) || (processCase == PROCESSCASE_6_GLOBAL_CRAWLING))
2710: && ((entry.profile().indexText()) || (entry
2711: .profile().indexMedia()))) {
2712: String urlHash = newEntry.hash();
2713:
2714: // remove stopwords
2715: log.logInfo("Excluded "
2716: + condenser.excludeWords(stopwords)
2717: + " words in URL " + entry.url());
2718: indexingEndTime = System.currentTimeMillis();
2719:
2720: storageStartTime = System.currentTimeMillis();
2721: int words = 0;
2722: String storagePeerHash;
2723: yacySeed seed;
2724:
2725: if (((storagePeerHash = getConfig(
2726: STORAGE_PEER_HASH, null)) == null)
2727: || (storagePeerHash.trim().length() == 0)
2728: || ((seed = yacyCore.seedDB
2729: .getConnected(storagePeerHash)) == null)) {
2730:
2731: /* ========================================================================
2732: * STORE PAGE INDEX INTO WORD INDEX DB
2733: * ======================================================================== */
2734: words = wordIndex.addPageIndex(entry.url(), // document url
2735: docDate, // document mod date
2736: (int) entry.size(), // document size
2737: document, // document content
2738: condenser, // document condenser
2739: yacyURL.language(entry.url()), // document language
2740: plasmaHTCache.docType(document
2741: .dc_format()),// document type
2742: ioLinks[0].intValue(), // outlinkSame
2743: ioLinks[1].intValue() // outlinkOthers
2744: );
2745: } else {
2746: /* ========================================================================
2747: * SEND PAGE INDEX TO STORAGE PEER
2748: * ======================================================================== */
2749: HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(
2750: 1);
2751: urlCache.put(newEntry.hash(), newEntry);
2752:
2753: ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(
2754: condenser.words().size());
2755:
2756: String language = yacyURL.language(entry
2757: .url());
2758: char doctype = plasmaHTCache
2759: .docType(document.dc_format());
2760: indexURLEntry.Components comp = newEntry
2761: .comp();
2762: int urlLength = comp.url().toNormalform(
2763: true, true).length();
2764: int urlComps = htmlFilterContentScraper
2765: .urlComps(comp.url().toNormalform(
2766: true, true)).length;
2767:
2768: // iterate over all words
2769: Iterator<Map.Entry<String, wordStatProp>> i = condenser
2770: .words().entrySet().iterator();
2771: Map.Entry<String, wordStatProp> wentry;
2772: plasmaCondenser.wordStatProp wordStat;
2773: while (i.hasNext()) {
2774: wentry = i.next();
2775: String word = wentry.getKey();
2776: wordStat = wentry.getValue();
2777: String wordHash = plasmaCondenser
2778: .word2hash(word);
2779: indexRWIEntry wordIdxEntry = new indexRWIRowEntry(
2780: urlHash, urlLength, urlComps,
2781: wordStat.count, document
2782: .dc_title().length(),
2783: condenser.words().size(),
2784: condenser.sentences().size(),
2785: wordStat.posInText,
2786: wordStat.posInPhrase,
2787: wordStat.numOfPhrase, 0,
2788: newEntry.size(), docDate
2789: .getTime(), System
2790: .currentTimeMillis(),
2791: language, doctype, ioLinks[0]
2792: .intValue(), ioLinks[1]
2793: .intValue(),
2794: condenser.RESULT_FLAGS);
2795: indexContainer wordIdxContainer = plasmaWordIndex
2796: .emptyContainer(wordHash, 1);
2797: wordIdxContainer.add(wordIdxEntry);
2798: tmpContainers.add(wordIdxContainer);
2799: }
2800: //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
2801: words = condenser.words().size();
2802:
2803: // transfering the index to the storage peer
2804: indexContainer[] indexData = (indexContainer[]) tmpContainers
2805: .toArray(new indexContainer[tmpContainers
2806: .size()]);
2807: HashMap<String, Object> resultObj = yacyClient
2808: .transferIndex(seed, // target seed
2809: indexData, // word index data
2810: urlCache, // urls
2811: true, // gzip body
2812: 120000 // transfer timeout
2813: );
2814:
2815: // check for interruption
2816: checkInterruption();
2817:
2818: // if the transfer failed we try to store the index locally
2819: String error = (String) resultObj
2820: .get("result");
2821: if (error != null) {
2822: words = wordIndex.addPageIndex(entry
2823: .url(), docDate, (int) entry
2824: .size(), document, condenser,
2825: yacyURL.language(entry.url()),
2826: plasmaHTCache.docType(document
2827: .dc_format()),
2828: ioLinks[0].intValue(),
2829: ioLinks[1].intValue());
2830: }
2831:
2832: tmpContainers = null;
2833: } //end: SEND PAGE INDEX TO STORAGE PEER
2834:
2835: storageEndTime = System.currentTimeMillis();
2836:
2837: //increment number of indexed urls
2838: indexedPages++;
2839:
2840: if (log.isInfo()) {
2841: // TODO: UTF-8 docDescription seems not to be displayed correctly because
2842: // of string concatenation
2843: log
2844: .logInfo("*Indexed "
2845: + words
2846: + " words in URL "
2847: + entry.url()
2848: + " ["
2849: + entry.urlHash()
2850: + "]"
2851: + "\n\tDescription: "
2852: + dc_title
2853: + "\n\tMimeType: "
2854: + document.dc_format()
2855: + " | Charset: "
2856: + document.getCharset()
2857: + " | "
2858: + "Size: "
2859: + document.getTextLength()
2860: + " bytes | "
2861: + "Anchors: "
2862: + ((document.getAnchors() == null) ? 0
2863: : document
2864: .getAnchors()
2865: .size())
2866: + "\n\tStackingTime: "
2867: + (stackEndTime - stackStartTime)
2868: + " ms | "
2869: + "ParsingTime: "
2870: + (parsingEndTime - parsingStartTime)
2871: + " ms | "
2872: + "IndexingTime: "
2873: + (indexingEndTime - indexingStartTime)
2874: + " ms | "
2875: + "StorageTime: "
2876: + (storageEndTime - storageStartTime)
2877: + " ms");
2878: }
2879:
2880: // update profiling info
2881: plasmaProfiling.updateIndexedPage(entry);
2882:
2883: // check for interruption
2884: checkInterruption();
2885:
2886: // if this was performed for a remote crawl request, notify requester
2887: if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
2888: && (initiatorPeer != null)) {
2889: log.logInfo("Sending crawl receipt for '"
2890: + entry.url().toNormalform(false,
2891: true) + "' to "
2892: + initiatorPeer.getName());
2893: if (clusterhashes != null)
2894: initiatorPeer
2895: .setAlternativeAddress((String) clusterhashes
2896: .get(initiatorPeer.hash));
2897: yacyClient.crawlReceipt(initiatorPeer,
2898: "crawl", "fill", "indexed",
2899: newEntry, "");
2900: }
2901: } else {
2902: log.logFine("Not Indexed Resource '"
2903: + entry.url().toNormalform(false, true)
2904: + "': process case=" + processCase);
2905: addURLtoErrorDB(
2906: entry.url(),
2907: referrerURL.hash(),
2908: initiatorPeerHash,
2909: dc_title,
2910: plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE,
2911: new kelondroBitfield());
2912: }
2913: } catch (Exception ee) {
2914: if (ee instanceof InterruptedException)
2915: throw (InterruptedException) ee;
2916:
2917: // check for interruption
2918: checkInterruption();
2919:
2920: log.logSevere("Could not index URL " + entry.url()
2921: + ": " + ee.getMessage(), ee);
2922: if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
2923: && (initiatorPeer != null)) {
2924: if (clusterhashes != null)
2925: initiatorPeer
2926: .setAlternativeAddress((String) clusterhashes
2927: .get(initiatorPeer.hash));
2928: yacyClient.crawlReceipt(initiatorPeer, "crawl",
2929: "exception", ee.getMessage(), null, "");
2930: }
2931: addURLtoErrorDB(
2932: entry.url(),
2933: (referrerURL == null) ? null : referrerURL
2934: .hash(),
2935: initiatorPeerHash,
2936: dc_title,
2937: plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR,
2938: new kelondroBitfield());
2939: }
2940:
2941: } else {
2942: // check for interruption
2943: checkInterruption();
2944:
2945: log.logInfo("Not indexed any word in URL "
2946: + entry.url() + "; cause: " + noIndexReason);
2947: addURLtoErrorDB(entry.url(),
2948: (referrerURL == null) ? null : referrerURL
2949: .hash(), initiatorPeerHash, dc_title,
2950: noIndexReason, new kelondroBitfield());
2951: if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
2952: && (initiatorPeer != null)) {
2953: if (clusterhashes != null)
2954: initiatorPeer
2955: .setAlternativeAddress((String) clusterhashes
2956: .get(initiatorPeer.hash));
2957: yacyClient.crawlReceipt(initiatorPeer, "crawl",
2958: "rejected", noIndexReason, null, "");
2959: }
2960: }
2961: document.close();
2962: document = null;
2963: } catch (Exception e) {
2964: if (e instanceof InterruptedException)
2965: throw (InterruptedException) e;
2966: this .log.logSevere(
2967: "Unexpected exception while parsing/indexing URL ",
2968: e);
2969: } catch (Error e) {
2970: this .log.logSevere(
2971: "Unexpected exception while parsing/indexing URL ",
2972: e);
2973: } finally {
2974: checkInterruption();
2975:
2976: // The following code must be into the finally block, otherwise it will not be executed
2977: // on errors!
2978:
2979: // removing current entry from in process list
2980: synchronized (this .indexingTasksInProcess) {
2981: this .indexingTasksInProcess.remove(entry.urlHash());
2982: }
2983:
2984: // removing current entry from notice URL queue
2985: /*
2986: boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
2987: if (!removed) {
2988: log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
2989: }
2990: */
2991:
2992: // explicit delete/free resources
2993: if ((entry != null) && (entry.profile() != null)
2994: && (!(entry.profile().storeHTCache()))) {
2995: plasmaHTCache.filesInUse.remove(entry.cacheFile());
2996: //plasmaHTCache.deleteURLfromCache(entry.url());
2997: }
2998: entry = null;
2999:
3000: if (document != null)
3001: try {
3002: document.close();
3003: } catch (Exception e) { /* ignore this */
3004: }
3005: }
3006: }
3007:
3008: private static SimpleDateFormat DateFormatter = new SimpleDateFormat(
3009: "EEE, dd MMM yyyy");
3010:
3011: public static String dateString(Date date) {
3012: if (date == null)
3013: return "";
3014: else
3015: return DateFormatter.format(date);
3016: }
3017:
3018: // we need locale independent RFC-822 dates at some places
3019: private static SimpleDateFormat DateFormatter822 = new SimpleDateFormat(
3020: "EEE, dd MMM yyyy HH:mm:ss Z", Locale.US);
3021:
3022: public static String dateString822(Date date) {
3023: if (date == null)
3024: return "";
3025: else
3026: return DateFormatter822.format(date);
3027: }
3028:
3029: public serverObjects action(String actionName,
3030: serverObjects actionInput) {
3031: // perform an action. (not used)
3032: return null;
3033: }
3034:
3035: public String toString() {
3036: // it is possible to use this method in the cgi pages.
3037: // actually it is used there for testing purpose
3038: return "PROPS: " + super .toString() + "; QUEUE: "
3039: + sbQueue.toString();
3040: }
3041:
3042: // method for index deletion
3043: public int removeAllUrlReferences(yacyURL url, boolean fetchOnline) {
3044: return removeAllUrlReferences(url.hash(), fetchOnline);
3045: }
3046:
3047: public int removeAllUrlReferences(String urlhash,
3048: boolean fetchOnline) {
3049: // find all the words in a specific resource and remove the url reference from every word index
3050: // finally, delete the url entry
3051:
3052: // determine the url string
3053: indexURLEntry entry = wordIndex.loadedURL
3054: .load(urlhash, null, 0);
3055: if (entry == null)
3056: return 0;
3057: indexURLEntry.Components comp = entry.comp();
3058: if (comp.url() == null)
3059: return 0;
3060:
3061: InputStream resourceContent = null;
3062: try {
3063: // get the resource content
3064: Object[] resource = plasmaSnippetCache.getResource(comp
3065: .url(), fetchOnline, 10000, true);
3066: resourceContent = (InputStream) resource[0];
3067: Long resourceContentLength = (Long) resource[1];
3068:
3069: // parse the resource
3070: plasmaParserDocument document = plasmaSnippetCache
3071: .parseDocument(comp.url(), resourceContentLength
3072: .longValue(), resourceContent);
3073:
3074: // get the word set
3075: Set<String> words = null;
3076: try {
3077: words = new plasmaCondenser(document, true, true)
3078: .words().keySet();
3079: } catch (UnsupportedEncodingException e) {
3080: e.printStackTrace();
3081: }
3082:
3083: // delete all word references
3084: int count = 0;
3085: if (words != null)
3086: count = wordIndex.removeWordReferences(words, urlhash);
3087:
3088: // finally delete the url entry itself
3089: wordIndex.loadedURL.remove(urlhash);
3090: return count;
3091: } catch (ParserException e) {
3092: return 0;
3093: } finally {
3094: if (resourceContent != null)
3095: try {
3096: resourceContent.close();
3097: } catch (Exception e) {/* ignore this */
3098: }
3099: }
3100: }
3101:
3102: public int adminAuthenticated(httpHeader header) {
3103:
3104: String adminAccountBase64MD5 = getConfig(
3105: httpd.ADMIN_ACCOUNT_B64MD5, "");
3106: String authorization = ((String) header.get(
3107: httpHeader.AUTHORIZATION, "xxxxxx")).trim()
3108: .substring(6);
3109:
3110: // security check against too long authorization strings
3111: if (authorization.length() > 256)
3112: return 0;
3113:
3114: // authorization by encoded password, only for localhost access
3115: if ((((String) header.get("CLIENTIP", "")).equals("localhost"))
3116: && (adminAccountBase64MD5.equals(authorization)))
3117: return 3; // soft-authenticated for localhost
3118:
3119: // authorization by hit in userDB
3120: if (userDB.hasAdminRight((String) header.get(
3121: httpHeader.AUTHORIZATION, "xxxxxx"), ((String) header
3122: .get("CLIENTIP", "")), header.getHeaderCookies()))
3123: return 4; //return, because 4=max
3124:
3125: // authorization with admin keyword in configuration
3126: return httpd.staticAdminAuthenticated(authorization, this );
3127: }
3128:
3129: public boolean verifyAuthentication(httpHeader header,
3130: boolean strict) {
3131: // handle access rights
3132: switch (adminAuthenticated(header)) {
3133: case 0: // wrong password given
3134: try {
3135: Thread.sleep(3000);
3136: } catch (InterruptedException e) {
3137: } // prevent brute-force
3138: return false;
3139: case 1: // no password given
3140: return false;
3141: case 2: // no password stored
3142: return !strict;
3143: case 3: // soft-authenticated for localhost only
3144: return true;
3145: case 4: // hard-authenticated, all ok
3146: return true;
3147: }
3148: return false;
3149: }
3150:
3151: public void setPerformance(int wantedPPM) {
3152: // we consider 3 cases here
3153: // wantedPPM <= 10: low performance
3154: // 10 < wantedPPM < 1000: custom performance
3155: // 1000 <= wantedPPM : maximum performance
3156: if (wantedPPM <= 10)
3157: wantedPPM = 10;
3158: if (wantedPPM >= 1000)
3159: wantedPPM = 1000;
3160: int newBusySleep = 60000 / wantedPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
3161:
3162: serverThread thread;
3163:
3164: thread = getThread(INDEX_DIST);
3165: if (thread != null) {
3166: setConfig(INDEX_DIST_BUSYSLEEP, thread.setBusySleep(Math
3167: .max(2000, thread.setBusySleep(newBusySleep * 2))));
3168: thread.setIdleSleep(30000);
3169: }
3170:
3171: thread = getThread(CRAWLJOB_LOCAL_CRAWL);
3172: if (thread != null) {
3173: setConfig(CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread
3174: .setBusySleep(newBusySleep));
3175: thread.setIdleSleep(1000);
3176: }
3177:
3178: thread = getThread(PROXY_CACHE_ENQUEUE);
3179: if (thread != null) {
3180: setConfig(PROXY_CACHE_ENQUEUE_BUSYSLEEP, thread
3181: .setBusySleep(0));
3182: thread.setIdleSleep(1000);
3183: }
3184:
3185: thread = getThread(INDEXER);
3186: if (thread != null) {
3187: setConfig(INDEXER_BUSYSLEEP, thread
3188: .setBusySleep(newBusySleep / 2));
3189: thread.setIdleSleep(1000);
3190: }
3191:
3192: }
3193:
3194: public static int accessFrequency(
3195: HashMap<String, TreeSet<Long>> tracker, String host) {
3196: // returns the access frequency in queries per hour for a given host and a specific tracker
3197: long timeInterval = 1000 * 60 * 60;
3198: TreeSet<Long> accessSet = tracker.get(host);
3199: if (accessSet == null)
3200: return 0;
3201: return accessSet.tailSet(
3202: new Long(System.currentTimeMillis() - timeInterval))
3203: .size();
3204: }
3205:
3206: public void startTransferWholeIndex(yacySeed seed, boolean delete) {
3207: if (transferIdxThread == null) {
3208: this .transferIdxThread = new plasmaDHTFlush(this .log,
3209: this .wordIndex, seed, delete,
3210: "true".equalsIgnoreCase(getConfig(
3211: INDEX_TRANSFER_GZIP_BODY, "false")),
3212: (int) getConfigLong(INDEX_TRANSFER_TIMEOUT, 60000));
3213: this .transferIdxThread.start();
3214: }
3215: }
3216:
3217: public void stopTransferWholeIndex(boolean wait) {
3218: if ((transferIdxThread != null)
3219: && (transferIdxThread.isAlive())
3220: && (!transferIdxThread.isFinished())) {
3221: try {
3222: this .transferIdxThread.stopIt(wait);
3223: } catch (InterruptedException e) {
3224: }
3225: }
3226: }
3227:
3228: public void abortTransferWholeIndex(boolean wait) {
3229: if (transferIdxThread != null) {
3230: if (!transferIdxThread.isFinished())
3231: try {
3232: this .transferIdxThread.stopIt(wait);
3233: } catch (InterruptedException e) {
3234: }
3235: transferIdxThread = null;
3236: }
3237: }
3238:
3239: public String dhtShallTransfer() {
3240: if (yacyCore.seedDB == null) {
3241: return "no DHT distribution: seedDB == null";
3242: }
3243: if (yacyCore.seedDB.mySeed() == null) {
3244: return "no DHT distribution: mySeed == null";
3245: }
3246: if (yacyCore.seedDB.mySeed().isVirgin()) {
3247: return "no DHT distribution: status is virgin";
3248: }
3249: if (yacyCore.seedDB.noDHTActivity()) {
3250: return "no DHT distribution: network too small";
3251: }
3252: if (getConfig(INDEX_DIST_ALLOW, "false").equalsIgnoreCase(
3253: "false")) {
3254: return "no DHT distribution: not enabled";
3255: }
3256: if (wordIndex.loadedURL.size() < 10) {
3257: return "no DHT distribution: loadedURL.size() = "
3258: + wordIndex.loadedURL.size();
3259: }
3260: if (wordIndex.size() < 100) {
3261: return "no DHT distribution: not enough words - wordIndex.size() = "
3262: + wordIndex.size();
3263: }
3264: if ((getConfig(INDEX_DIST_ALLOW_WHILE_CRAWLING, "false")
3265: .equalsIgnoreCase("false"))
3266: && (crawlQueues.noticeURL.notEmpty())) {
3267: return "no DHT distribution: crawl in progress: noticeURL.stackSize() = "
3268: + crawlQueues.noticeURL.size()
3269: + ", sbQueue.size() = " + sbQueue.size();
3270: }
3271: if ((getConfig(INDEX_DIST_ALLOW_WHILE_INDEXING, "false")
3272: .equalsIgnoreCase("false"))
3273: && (sbQueue.size() > 1)) {
3274: return "no DHT distribution: indexing in progress: noticeURL.stackSize() = "
3275: + crawlQueues.noticeURL.size()
3276: + ", sbQueue.size() = " + sbQueue.size();
3277: }
3278: return null;
3279: }
3280:
3281: public boolean dhtTransferJob() {
3282: String rejectReason = dhtShallTransfer();
3283: if (rejectReason != null) {
3284: log.logFine(rejectReason);
3285: return false;
3286: }
3287: if (this .dhtTransferChunk == null) {
3288: log
3289: .logFine("no DHT distribution: no transfer chunk defined");
3290: return false;
3291: }
3292: if ((this .dhtTransferChunk != null)
3293: && (this .dhtTransferChunk.getStatus() != plasmaDHTChunk.chunkStatus_FILLED)) {
3294: log
3295: .logFine("no DHT distribution: index distribution is in progress, status="
3296: + this .dhtTransferChunk.getStatus());
3297: return false;
3298: }
3299:
3300: // do the transfer
3301: int peerCount = Math.max(1, (yacyCore.seedDB.mySeed()
3302: .isJunior()) ? (int) getConfigLong(
3303: "network.unit.dhtredundancy.junior", 1)
3304: : (int) getConfigLong(
3305: "network.unit.dhtredundancy.senior", 1)); // set redundancy factor
3306: long starttime = System.currentTimeMillis();
3307:
3308: boolean ok = dhtTransferProcess(dhtTransferChunk, peerCount);
3309:
3310: if (ok) {
3311: dhtTransferChunk
3312: .setStatus(plasmaDHTChunk.chunkStatus_COMPLETE);
3313: log.logFine("DHT distribution: transfer COMPLETE");
3314: // adopt transfer count
3315: if ((System.currentTimeMillis() - starttime) > (10000 * peerCount)) {
3316: dhtTransferIndexCount--;
3317: } else {
3318: if (dhtTransferChunk.indexCount() >= dhtTransferIndexCount)
3319: dhtTransferIndexCount++;
3320: }
3321: int minChunkSize = (int) getConfigLong(
3322: INDEX_DIST_CHUNK_SIZE_MIN, 30);
3323: int maxChunkSize = (int) getConfigLong(
3324: INDEX_DIST_CHUNK_SIZE_MAX, 3000);
3325: if (dhtTransferIndexCount < minChunkSize)
3326: dhtTransferIndexCount = minChunkSize;
3327: if (dhtTransferIndexCount > maxChunkSize)
3328: dhtTransferIndexCount = maxChunkSize;
3329:
3330: // show success
3331: return true;
3332: } else {
3333: dhtTransferChunk.incTransferFailedCounter();
3334: int maxChunkFails = (int) getConfigLong(
3335: INDEX_DIST_CHUNK_FAILS_MAX, 1);
3336: if (dhtTransferChunk.getTransferFailedCounter() >= maxChunkFails) {
3337: //System.out.println("DEBUG: " + dhtTransferChunk.getTransferFailedCounter() + " of " + maxChunkFails + " sendings failed for this chunk, aborting!");
3338: dhtTransferChunk
3339: .setStatus(plasmaDHTChunk.chunkStatus_FAILED);
3340: log.logFine("DHT distribution: transfer FAILED");
3341: } else {
3342: //System.out.println("DEBUG: " + dhtTransferChunk.getTransferFailedCounter() + " of " + maxChunkFails + " sendings failed for this chunk, retrying!");
3343: log
3344: .logFine("DHT distribution: transfer FAILED, sending this chunk again");
3345: }
3346: return false;
3347: }
3348: }
3349:
3350: public boolean dhtTransferProcess(plasmaDHTChunk dhtChunk,
3351: int peerCount) {
3352: if ((yacyCore.seedDB == null)
3353: || (yacyCore.seedDB.sizeConnected() == 0))
3354: return false;
3355:
3356: try {
3357: // find a list of DHT-peers
3358: double maxDist = 0.2;
3359: ArrayList<yacySeed> seeds = yacyCore.dhtAgent
3360: .getDHTTargets(log, peerCount, Math
3361: .min(8, (int) (yacyCore.seedDB
3362: .sizeConnected() * maxDist)),
3363: dhtChunk.firstContainer().getWordHash(),
3364: dhtChunk.lastContainer().getWordHash(),
3365: maxDist);
3366: if (seeds.size() < peerCount) {
3367: log.logWarning("found not enough (" + seeds.size()
3368: + ") peers for distribution for dhtchunk ["
3369: + dhtChunk.firstContainer().getWordHash()
3370: + " .. "
3371: + dhtChunk.lastContainer().getWordHash() + "]");
3372: return false;
3373: }
3374:
3375: // send away the indexes to all these peers
3376: int hc1 = 0;
3377:
3378: // getting distribution configuration values
3379: boolean gzipBody = getConfig(INDEX_DIST_GZIP_BODY, "false")
3380: .equalsIgnoreCase("true");
3381: int timeout = (int) getConfigLong(INDEX_DIST_TIMEOUT, 60000);
3382: int retries = 0;
3383:
3384: // starting up multiple DHT transfer threads
3385: Iterator<yacySeed> seedIter = seeds.iterator();
3386: ArrayList<plasmaDHTTransfer> transfer = new ArrayList<plasmaDHTTransfer>(
3387: peerCount);
3388: while (hc1 < peerCount
3389: && (transfer.size() > 0 || seedIter.hasNext())) {
3390:
3391: // starting up some transfer threads
3392: int transferThreadCount = transfer.size();
3393: for (int i = 0; i < peerCount - hc1
3394: - transferThreadCount; i++) {
3395: // check for interruption
3396: checkInterruption();
3397:
3398: if (seedIter.hasNext()) {
3399: plasmaDHTTransfer t = new plasmaDHTTransfer(
3400: log, (yacySeed) seedIter.next(),
3401: dhtChunk, gzipBody, timeout, retries);
3402: t.start();
3403: transfer.add(t);
3404: } else {
3405: break;
3406: }
3407: }
3408:
3409: // waiting for the transfer threads to finish
3410: Iterator<plasmaDHTTransfer> transferIter = transfer
3411: .iterator();
3412: while (transferIter.hasNext()) {
3413: // check for interruption
3414: checkInterruption();
3415:
3416: plasmaDHTTransfer t = transferIter.next();
3417: if (!t.isAlive()) {
3418: // remove finished thread from the list
3419: transferIter.remove();
3420:
3421: // count successful transfers
3422: if (t.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) {
3423: this .log
3424: .logInfo("DHT distribution: transfer to peer "
3425: + t.getSeed().getName()
3426: + " finished.");
3427: hc1++;
3428: }
3429: }
3430: }
3431:
3432: if (hc1 < peerCount)
3433: Thread.sleep(100);
3434: }
3435:
3436: // clean up and finish with deletion of indexes
3437: if (hc1 >= peerCount) {
3438: // success
3439: return true;
3440: }
3441: this .log
3442: .logSevere("Index distribution failed. Too few peers ("
3443: + hc1
3444: + ") received the index, not deleted locally.");
3445: return false;
3446: } catch (InterruptedException e) {
3447: return false;
3448: }
3449: }
3450:
3451: private void addURLtoErrorDB(yacyURL url, String referrerHash,
3452: String initiator, String name, String failreason,
3453: kelondroBitfield flags) {
3454: // create a new errorURL DB entry
3455: plasmaCrawlEntry bentry = new plasmaCrawlEntry(initiator, url,
3456: referrerHash, (name == null) ? "" : name, new Date(),
3457: null, 0, 0, 0);
3458: plasmaCrawlZURL.Entry ee = crawlQueues.errorURL.newEntry(
3459: bentry, initiator, new Date(), 0, failreason);
3460: // store the entry
3461: ee.store();
3462: // push it onto the stack
3463: crawlQueues.errorURL.push(ee);
3464: }
3465:
3466: public void checkInterruption() throws InterruptedException {
3467: Thread curThread = Thread.currentThread();
3468: if ((curThread instanceof serverThread)
3469: && ((serverThread) curThread).shutdownInProgress())
3470: throw new InterruptedException("Shutdown in progress ...");
3471: else if (this .terminate || curThread.isInterrupted())
3472: throw new InterruptedException("Shutdown in progress ...");
3473: }
3474:
3475: public void terminate(long delay) {
3476: if (delay <= 0)
3477: throw new IllegalArgumentException(
3478: "The shutdown delay must be greater than 0.");
3479: (new delayedShutdown(this , delay)).start();
3480: }
3481:
3482: public void terminate() {
3483: this .terminate = true;
3484: this .shutdownSync.V();
3485: }
3486:
3487: public boolean isTerminated() {
3488: return this .terminate;
3489: }
3490:
3491: public boolean waitForShutdown() throws InterruptedException {
3492: this .shutdownSync.P();
3493: return this .terminate;
3494: }
3495: }
3496:
3497: class MoreMemory extends TimerTask {
3498: public final void run() {
3499: serverMemory.gc(10000, "MoreMemory()");
3500: }
3501: }
3502:
3503: class delayedShutdown extends Thread {
3504: private plasmaSwitchboard sb;
3505: private long delay;
3506:
3507: public delayedShutdown(plasmaSwitchboard sb, long delay) {
3508: this .sb = sb;
3509: this .delay = delay;
3510: }
3511:
3512: public void run() {
3513: try {
3514: Thread.sleep(delay);
3515: } catch (InterruptedException e) {
3516: sb.getLog().logInfo("interrupted delayed shutdown");
3517: }
3518: this.sb.terminate();
3519: }
3520: }
|