Source Code for plasmaCrawlQueues.java (YaCy, package de.anomic.plasma.crawler)



// plasmaCrawlQueues.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 29.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.plasma.crawler;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import de.anomic.data.robotsParser;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.rssReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

public class plasmaCrawlQueues {

    private plasmaSwitchboard sb;
    private serverLog log;
    private Map<Integer, crawlWorker> workers; // mapping from entry hash code to worker thread object
    private plasmaProtocolLoader loader;
    private ArrayList<String> remoteCrawlProviderHashes;

    public plasmaCrawlNURL noticeURL;
    public plasmaCrawlZURL errorURL, delegatedURL;

    public plasmaCrawlQueues(plasmaSwitchboard sb, File plasmaPath) {
        this.sb = sb;
        this.log = new serverLog("CRAWLER");
        this.workers = Collections.synchronizedMap(new HashMap<Integer, crawlWorker>());
        this.loader = new plasmaProtocolLoader(sb, log);
        this.remoteCrawlProviderHashes = new ArrayList<String>();

        // start crawling management
        log.logConfig("Starting Crawling Management");
        noticeURL = new plasmaCrawlNURL(plasmaPath);
        //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be held in RAM, which reduces IO
        errorURL = new plasmaCrawlZURL(plasmaPath, "urlError2.db", false);
        delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated2.db", true);
    }

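    /**
     * Tests whether a URL hash is already known to the crawler: in the
     * notice-URL stacks, the delegated-URL DB, the error DB, or one of
     * the currently active worker threads.
     *
     * @return the name of the data structure holding the hash ("crawler",
     *         "delegated", "errors" or "worker"), or null if the hash is unknown
     */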
    public String urlExists(String hash) {
        // tests if the hash occurs in any database;
        // if it exists, the name of the database is returned,
        // if it does not exist, null is returned
        if (noticeURL.existsInStack(hash))
            return "crawler";
        if (delegatedURL.exists(hash))
            return "delegated";
        if (errorURL.exists(hash))
            return "errors";
        Iterator<crawlWorker> i = workers.values().iterator();
        while (i.hasNext())
            if (i.next().entry.url().hash().equals(hash))
                return "worker";
        return null;
    }

    public void urlRemove(String hash) {
        noticeURL.removeByURLHash(hash);
        delegatedURL.remove(hash);
        errorURL.remove(hash);
    }

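    /**
     * Resolves a URL hash back to its full URL by searching, in order, the
     * notice-URL stacks, the delegated-URL DB, the error DB, and the entries
     * held by active worker threads.
     *
     * @return the URL belonging to the hash, or null if it cannot be resolved
     */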
    public yacyURL getURL(String urlhash) {
        if (urlhash.equals(yacyURL.dummyHash))
            return null;
        plasmaCrawlEntry ne = noticeURL.get(urlhash);
        if (ne != null)
            return ne.url();
        plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash);
        if (ee != null)
            return ee.url();
        ee = errorURL.getEntry(urlhash);
        if (ee != null)
            return ee.url();
        Iterator<crawlWorker> i = workers.values().iterator();
        crawlWorker w;
        while (i.hasNext()) {
            w = i.next();
            if (w.entry.url().hash().equals(urlhash))
                return w.entry.url();
        }
        return null;
    }

    public void close() {
        // wait for all workers to finish
        Iterator<crawlWorker> i = workers.values().iterator();
        while (i.hasNext())
            i.next().interrupt();
        // TODO: wait some more time until all threads are finished
        noticeURL.close();
        errorURL.close();
        delegatedURL.close();
    }

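    /**
     * Returns a snapshot of the crawl entries currently being processed
     * by active worker threads.
     */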
    public plasmaCrawlEntry[] activeWorker() {
        synchronized (workers) {
            plasmaCrawlEntry[] w = new plasmaCrawlEntry[workers.size()];
            int i = 0;
            Iterator<crawlWorker> j = workers.values().iterator();
            while (j.hasNext()) {
                w[i++] = j.next().entry;
            }
            return w;
        }
    }

    public boolean isSupportedProtocol(String protocol) {
        return loader.isSupportedProtocol(protocol);
    }

    public int coreCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
    }

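    /**
     * Performs one step of the local ("core") crawl: if the core stack runs
     * low, up to 10 jobs are shifted over from the global limit stack; then,
     * unless the indexer queue or loader queue is full, crawling is paused,
     * or online caution is active, one entry is popped from the core stack
     * and handed to a crawlWorker.
     *
     * @return true if an entry was processed (or at least attempted),
     *         false if the queue is empty or a resource check failed
     */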
    public boolean coreCrawlJob() {

        boolean robinsonPrivateCase = (sb.isRobinsonMode())
                && (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PUBLIC_CLUSTER))
                && (!sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "").equals(plasmaSwitchboard.CLUSTER_MODE_PRIVATE_CLUSTER));

        if (((robinsonPrivateCase) || (coreCrawlJobSize() <= 20)) && (limitCrawlJobSize() > 0)) {
            // move some tasks to the core crawl job so we have something to do
            int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer forces a wait if it cannot balance
            for (int i = 0; i < toshift; i++) {
                noticeURL.shift(plasmaCrawlNURL.STACK_TYPE_LIMIT, plasmaCrawlNURL.STACK_TYPE_CORE);
            }
            log.logInfo("shifted " + toshift
                    + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize()
                    + ", limitCrawlJobSize()=" + limitCrawlJobSize()
                    + ", cluster.mode=" + sb.getConfig(plasmaSwitchboard.CLUSTER_MODE, "")
                    + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
        }

        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
            //log.logDebug("CoreCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("CoreCrawl: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("CoreCrawl: online caution, omitting processing");
            return false;
        }
        // if the server is busy, we do crawling more slowly
        //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}

        // if crawling was paused we have to wait until we are notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // do a local crawl
        plasmaCrawlEntry urlEntry = null;
        while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
            String stats = "LOCALCRAWL["
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                    + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
            try {
                urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE, true);
                String profileHandle = urlEntry.profileHandle();
                // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
                if (profileHandle == null) {
                    log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }
                plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);
                if (profile == null) {
                    log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }

                // check if the protocol is supported
                yacyURL url = urlEntry.url();
                String urlProtocol = url.getProtocol();
                if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                    this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                    return true;
                }

                log.logFine("LOCALCRAWL: URL=" + urlEntry.url()
                        + ", initiator=" + urlEntry.initiator()
                        + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                        + ", depth=" + urlEntry.depth()
                        + ", crawlDepth=" + profile.generalDepth()
                        + ", filter=" + profile.generalFilter()
                        + ", permission=" + ((yacyCore.seedDB == null) ? "undefined"
                                : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));

                processLocalCrawling(urlEntry, stats);
                return true;
            } catch (IOException e) {
                log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
                if (e.getMessage().indexOf("hash is null") > 0)
                    noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
            }
        }
        return true;
    }

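    /**
     * Fetches crawl URLs that other peers offer for remote crawling: if this
     * peer accepts remote crawls and is idle enough, it picks a provider peer
     * (discovered via the DHT agent), queries that peer's remote-crawl RSS
     * feed, and stacks each acceptable URL onto the local crawl stacker with
     * the default remote profile.
     */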
    public boolean remoteCrawlLoaderJob() {
        // check if we are allowed to crawl urls provided by other peers
        if (!yacyCore.seedDB.mySeed().getFlagAcceptRemoteCrawl()) {
            //this.log.logInfo("remoteCrawlLoaderJob: not done, we are not allowed to do that");
            return false;
        }

        // check if we are a senior peer
        if (!yacyCore.seedDB.mySeed().isActive()) {
            //this.log.logInfo("remoteCrawlLoaderJob: not done, this should be a senior or principal peer");
            return false;
        }

        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }

        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }

        if (sb.onlineCaution()) {
            log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
            return false;
        }

        // check if we have an entry in the provider list, otherwise fill the list
        yacySeed seed;
        if ((remoteCrawlProviderHashes.size() == 0) && (coreCrawlJobSize() == 0)
                && (remoteTriggeredCrawlJobSize() == 0) && (sb.queueSize() < 10)) {
            if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
                Iterator<yacySeed> e = yacyCore.dhtAgent.getProvidesRemoteCrawlURLs();
                while (e.hasNext()) {
                    seed = e.next();
                    if (seed != null) {
                        remoteCrawlProviderHashes.add(seed.hash);
                    }
                }
            }
        }
        if (remoteCrawlProviderHashes.size() == 0)
            return false;

        // take one entry from the provider list and load the entries from the remote peer
        seed = null;
        String hash = null;
        while ((seed == null) && (remoteCrawlProviderHashes.size() > 0)) {
            hash = remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
            if (hash == null)
                continue;
            seed = yacyCore.seedDB.get(hash);
            if (seed == null)
                continue;
            // check if the peer is inside our cluster
            if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(seed))) {
                seed = null;
                continue;
            }
        }
        if (seed == null)
            return false;

        // we know a peer which should provide remote crawl entries. load them now.
        rssReader reader = yacyClient.queryRemoteCrawlURLs(seed, 10);
        if (reader == null)
            return true;
        // parse the rss
        rssReader.Item item;
        yacyURL url, referrer;
        Date loaddate;
        for (int i = 0; i < reader.items(); i++) {
            item = reader.getItem(i);
            //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());

            // put url on remote crawl stack
            try {
                url = new yacyURL(item.getLink(), null);
            } catch (MalformedURLException e) {
                url = null;
            }
            if (url == null)
                continue; // skip items with malformed links
            try {
                referrer = new yacyURL(item.getReferrer(), null);
            } catch (MalformedURLException e) {
                referrer = null;
            }
            try {
                loaddate = serverDate.parseShortSecond(item.getPubDate());
            } catch (ParseException e) {
                loaddate = new Date();
            }
            if (sb.acceptURL(url)) {
                // stack url
                sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash,
                        item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);

                if (reasonString == null) {
                    // done
                    log.logInfo("crawlOrder: added remote crawl url: " + url.toNormalform(true, false));
                } else if (reasonString.startsWith("double")) {
                    // case where we have already loaded the url
                    log.logInfo("crawlOrder: ignored double remote crawl url: " + url.toNormalform(true, false));
                } else {
                    log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
                }
            } else {
                log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
            }
        }
        return true;
    }

    public int limitCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
    }

    public int remoteTriggeredCrawlJobSize() {
        return noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
    }

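    /**
     * Works off one crawl request that another peer has placed on our remote
     * crawl stack. The same resource checks as in coreCrawlJob apply; the
     * entry is then loaded locally via processLocalCrawling.
     */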
    public boolean remoteTriggeredCrawlJob() {
        // work off crawl requests that other peers have placed on our crawl stack

        // do nothing if either there are private processes to be done
        // or there is no global crawl on the stack
        if (noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) == 0) {
            //log.logDebug("GlobalCrawl: queue is empty");
            return false;
        }
        if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
            log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (sbQueueSize=" + sb.sbQueue.size() + ")");
            return false;
        }
        if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
            log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (cacheLoader=" + this.size() + ")");
            return false;
        }
        if (sb.onlineCaution()) {
            log.logFine("GlobalCrawl: online caution, omitting processing");
            return false;
        }

        // if crawling was paused we have to wait until we are notified to continue
        Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
        synchronized (status[plasmaSwitchboard.CRAWLJOB_SYNC]) {
            if (((Boolean) status[plasmaSwitchboard.CRAWLJOB_STATUS]).booleanValue()) {
                try {
                    status[plasmaSwitchboard.CRAWLJOB_SYNC].wait();
                } catch (InterruptedException e) {
                    return false;
                }
            }
        }

        // we don't want to crawl a global URL globally, since WE are the global part (from this point of view)
        String stats = "REMOTETRIGGEREDCRAWL["
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
                + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
        try {
            plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE, true);
            String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
            plasmaCrawlProfile.entry profile = sb.profilesActiveCrawls.getEntry(profileHandle);

            if (profile == null) {
                log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                return false;
            }

            // check if the protocol is supported
            yacyURL url = urlEntry.url();
            String urlProtocol = url.getProtocol();
            if (!this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
                this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'");
                return true;
            }

            log.logFine("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url()
                    + ", initiator=" + urlEntry.initiator()
                    + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                    + ", depth=" + urlEntry.depth()
                    + ", crawlDepth=" + profile.generalDepth()
                    + ", filter=" + profile.generalFilter()
                    + ", permission=" + ((yacyCore.seedDB == null) ? "undefined"
                            : (((yacyCore.seedDB.mySeed().isSenior()) || (yacyCore.seedDB.mySeed().isPrincipal())) ? "true" : "false")));

            processLocalCrawling(urlEntry, stats);
            return true;
        } catch (IOException e) {
            log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
            if (e.getMessage().indexOf("hash is null") > 0)
                noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE);
            return true;
        }
    }

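    /**
     * Hands one crawl entry to a freshly created crawlWorker thread, which
     * registers itself in the workers map and starts loading asynchronously.
     */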
    private void processLocalCrawling(plasmaCrawlEntry entry, String stats) {
        // work off one crawl stack entry
        if ((entry == null) || (entry.url() == null)) {
            log.logInfo(stats + ": urlEntry = null");
            return;
        }
        new crawlWorker(entry);

        log.logInfo(stats + ": enqueued for load " + entry.url() + " [" + entry.url().hash() + "]");
        return;
    }

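    /**
     * Loads a single resource from the web outside of the regular crawl,
     * e.g. for snippet generation, by wrapping the URL in a one-off crawl
     * entry with the default text or media snippet profile.
     */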
    public plasmaHTCache.Entry loadResourceFromWeb(yacyURL url,
            int socketTimeout, boolean keepInMemory, boolean forText) {

        plasmaCrawlEntry centry = new plasmaCrawlEntry(
                yacyCore.seedDB.mySeed().hash, url, null, "", new Date(),
                (forText) ? sb.defaultTextSnippetProfile.handle()
                        : sb.defaultMediaSnippetProfile.handle(), // crawl profile
                0, 0, 0);

        return loader.load(centry,
                (forText) ? plasmaParser.PARSER_MODE_CRAWLER
                        : plasmaParser.PARSER_MODE_IMAGE);
    }

    public int size() {
        return workers.size();
    }

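    /**
     * A worker thread that loads exactly one crawl entry: it checks
     * robots.txt for http(s) URLs, then delegates the download to the
     * protocol loader; any failure is recorded in the error-URL DB. The
     * worker registers itself in the workers map on construction and
     * removes itself when finished.
     */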
    protected class crawlWorker extends Thread {

        public plasmaCrawlEntry entry;
        private Integer code;

        public crawlWorker(plasmaCrawlEntry entry) {
            this.entry = entry;
            this.entry.setStatus("worker-initialized");
            this.code = new Integer(entry.hashCode());
            if (!workers.containsKey(code)) {
                workers.put(code, this);
                this.start();
            }
        }

        public void run() {
            try {
                // checking robots.txt for http(s) resources
                this.entry.setStatus("worker-checkingrobots");
                if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https"))
                        && robotsParser.isDisallowed(entry.url())) {
                    log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
                    plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "denied by robots.txt");
                    eentry.store();
                    errorURL.push(eentry);
                } else {
                    // starting a load from the internet
                    this.entry.setStatus("worker-loading");
                    String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
                    if (result != null) {
                        plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), "cannot load: " + result);
                        eentry.store();
                        errorURL.push(eentry);
                    } else {
                        this.entry.setStatus("worker-processed");
                    }
                }
            } catch (Exception e) {
                plasmaCrawlZURL.Entry eentry = errorURL.newEntry(this.entry.url(), e.getMessage() + " - in worker");
                eentry.store();
                errorURL.push(eentry);
                e.printStackTrace();
            } finally {
                workers.remove(code);
                this.entry.setStatus("worker-finalized");
            }
        }

    }

}
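For orientation, a minimal sketch of how these queue jobs could be driven is shown below. In YaCy itself the jobs are scheduled by the switchboard; the driver class here, its DB path, the round count, and the use of plasmaSwitchboard.getSwitchboard() are assumptions for illustration only. The point is the call pattern: give each job a chance to run, and back off while none of them finds work.

package de.anomic.plasma.crawler;

import java.io.File;

import de.anomic.plasma.plasmaSwitchboard;

// Hypothetical driver, not part of YaCy: illustrates the intended call pattern only.
public class CrawlQueuesDriver {

    public static void main(String[] args) throws InterruptedException {
        // assumption: a fully configured switchboard singleton is already running
        plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
        plasmaCrawlQueues queues = new plasmaCrawlQueues(sb, new File("DATA/PLASMADB"));

        // run a few scheduling rounds, then shut down (a real scheduler would loop indefinitely)
        for (int round = 0; round < 100; round++) {
            boolean busy = false;                      // each job returns true if it did (or attempted) work
            busy |= queues.coreCrawlJob();             // work off the local crawl stack
            busy |= queues.remoteTriggeredCrawlJob();  // work off crawls requested by other peers
            busy |= queues.remoteCrawlLoaderJob();     // fetch new crawl URLs from provider peers
            if (!busy) Thread.sleep(1000);             // back off when all queues are idle
        }
        queues.close(); // interrupts workers and closes the URL databases
    }
}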