//plasmaHTTPLoader.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2006
//
// $LastChangedDate: 2006-08-12 16:28:14 +0200 (Sa, 12 Aug 2006) $
// $LastChangedRevision: 2397 $
// $LastChangedBy: theli $
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is at your own risk. The installation and usage (starting/running) of this
//software may allow other people or applications to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt alongside this file in the shipment you received can be done to the
//lines that follow this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.

package de.anomic.plasma.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.NoRouteToHostException;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Date;

import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.http.httpdBoundedSizeOutputStream;
import de.anomic.http.httpdLimitExceededException;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.http.ResourceInfo;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;

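/**
 * HTTP loader of the YaCy crawler. It fetches a single URL with an HTTP GET
 * request, checks it against the crawler blacklist, enforces the configured
 * maximum file size and the cache path restrictions, follows HTTP redirects
 * up to {@link #DEFAULT_CRAWLING_RETRY_COUNT} times and writes the downloaded
 * content into the HTCache. Rejected URLs are recorded in the error-URL queue
 * together with a fail reason code.
 *
 * A rough usage sketch (how the switchboard, logger and crawl entry are
 * obtained depends on the surrounding crawler code; parserMode is one of the
 * plasmaParser.PARSER_MODE_* constants):
 * <pre>
 *   plasmaHTTPLoader loader = new plasmaHTTPLoader(sb, log);
 *   plasmaHTCache.Entry result = loader.load(entry, parserMode);
 *   if (result == null) {
 *       // the URL was rejected; the fail reason is in sb.crawlQueues.errorURL
 *   }
 * </pre>
 */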
public final class plasmaHTTPLoader {

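    /**
     * Maximum number of HTTP redirects that are followed for a single URL
     * before crawling of that URL is aborted.
     */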
    public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;

    /**
     * The socket timeout that should be used
     */
    private int socketTimeout;

    /**
     * The maximum allowed file size
     */
    private long maxFileSize = -1;

    /**
     * The remote http proxy that should be used
     */
    private httpRemoteProxyConfig remoteProxyConfig;

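    // values for the Accept-Encoding, Accept-Language and Accept-Charset request headers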
    private String acceptEncoding;
    private String acceptLanguage;
    private String acceptCharset;
    private plasmaSwitchboard sb;
    private serverLog log;

    public plasmaHTTPLoader(plasmaSwitchboard sb, serverLog theLog) {
        this.sb = sb;
        this.log = theLog;

        // read the configured socket timeout
        this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);

        // maximum allowed file size
        this.maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", -1);

        // some http header values
        this.acceptEncoding = sb.getConfig("crawler.http.acceptEncoding", "gzip,deflate");
        this.acceptLanguage = sb.getConfig("crawler.http.acceptLanguage", "en-us,en;q=0.5");
        this.acceptCharset = sb.getConfig("crawler.http.acceptCharset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");

        // get the http proxy configuration
        this.remoteProxyConfig = sb.remoteProxyConfig;
    }

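    /**
     * Wraps the request/response headers of a completed HTTP transfer into an
     * {@link IResourceInfo} and creates a new HTCache entry from it; the crawl
     * profile is resolved via the profile handle stored in the crawl entry.
     */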
    protected plasmaHTCache.Entry createCacheEntry(plasmaCrawlEntry entry, Date requestDate,
            httpHeader requestHeader, httpc.response response) {
        IResourceInfo resourceInfo = new ResourceInfo(entry.url(), requestHeader, response.responseHeader);
        return plasmaHTCache.newEntry(
                requestDate,
                entry.depth(),
                entry.url(),
                entry.name(),
                response.status,
                resourceInfo,
                entry.initiator(),
                sb.profilesActiveCrawls.getEntry(entry.profileHandle()));
    }

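    /**
     * Loads the given URL with the default redirection limit of
     * {@link #DEFAULT_CRAWLING_RETRY_COUNT}.
     *
     * @return the resulting cache entry, or <code>null</code> if the URL was
     *         rejected; in that case the fail reason has been stored in the
     *         error-URL queue
     */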
    public plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode) {
        return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
    }

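    /**
     * Does the actual loading. On an HTTP 3xx redirect this method calls itself
     * recursively with a decremented <code>retryCount</code>; once the counter
     * drops below zero the URL is rejected with
     * DENIED_REDIRECTION_COUNTER_EXCEEDED.
     */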
    private plasmaHTCache.Entry load(plasmaCrawlEntry entry, String parserMode, int retryCount) {

        if (retryCount < 0) {
            this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
            sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                    plasmaCrawlEURL.DENIED_REDIRECTION_COUNTER_EXCEEDED).store();
            return null;
        }

        Date requestDate = new Date(); // remember the time...
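        // extract host, path, port and ssl flag from the URL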
        String host = entry.url().getHost();
        String path = entry.url().getFile();
        int port = entry.url().getPort();
        boolean ssl = entry.url().getProtocol().equals("https");
        if (port < 0) port = (ssl) ? 443 : 80;

        // check if url is in blacklist
        String hostlow = host.toLowerCase();
        if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, hostlow, path)) {
            this.log.logInfo("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
            sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                    plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST).store();
            return null;
        }

        // take a file from the net
        httpc remote = null;
        plasmaHTCache.Entry htCache = null;
        try {
            // create a request header
            httpHeader requestHeader = new httpHeader();
            requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.crawlerUserAgent);
            yacyURL refererURL = null;
            if (entry.referrerhash() != null) refererURL = sb.getURL(entry.referrerhash());
            if (refererURL != null) requestHeader.put(httpHeader.REFERER, refererURL.toNormalform(true, true));
            if (this.acceptLanguage != null && this.acceptLanguage.length() > 0) requestHeader.put(httpHeader.ACCEPT_LANGUAGE, this.acceptLanguage);
            if (this.acceptCharset != null && this.acceptCharset.length() > 0) requestHeader.put(httpHeader.ACCEPT_CHARSET, this.acceptCharset);
            if (this.acceptEncoding != null && this.acceptEncoding.length() > 0) requestHeader.put(httpHeader.ACCEPT_ENCODING, this.acceptEncoding);

            // open the connection
            remote = new httpc(host, host, port, this.socketTimeout, ssl, this.remoteProxyConfig, "CRAWLER", null);

            // specify whether content encoding is allowed
            remote.setAllowContentEncoding(this.acceptEncoding != null && this.acceptEncoding.length() > 0);

            // send request
            httpc.response res = remote.GET(path, requestHeader);

            if (res.status.startsWith("200") || res.status.startsWith("203")) {
                // the transfer is ok

                // create a new cache entry
                htCache = createCacheEntry(entry, requestDate, requestHeader, res);

                // abort the download if the resulting cache file path would be too long
                if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {
                    remote.close();
                    this.log.logInfo("REJECTED URL " + entry.url().toString() + " because path too long '" + plasmaHTCache.cachePath.getAbsolutePath() + "'");
                    sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                            plasmaCrawlEURL.DENIED_CACHEFILE_PATH_TOO_LONG);
                    return (htCache = null);
                }

                // reserve cache entry
                if (!htCache.cacheFile().getCanonicalPath().startsWith(plasmaHTCache.cachePath.getCanonicalPath())) {
                    // reject the file if its cache path does not lie inside the cache directory
                    remote.close();
                    this.log.logInfo("REJECTED URL " + entry.url().toString() + " because of an invalid file path ('" + htCache.cacheFile().getCanonicalPath() + "' does not start with '" + plasmaHTCache.cachePath.getAbsolutePath() + "').");
                    sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                            plasmaCrawlEURL.DENIED_INVALID_CACHEFILE_PATH);
                    return (htCache = null);
                }

                // the request has been placed and a result has been returned; work off the response
                File cacheFile = plasmaHTCache.getCachePath(entry.url());
                try {
                    if (plasmaParser.supportedContent(parserMode, entry.url(), res.responseHeader.mime())) {
                        // delete old content
                        if (cacheFile.isFile()) {
                            plasmaHTCache.deleteURLfromCache(entry.url());
                        }

                        // create parent directories
                        cacheFile.getParentFile().mkdirs();

                        OutputStream fos = null;
                        try {
                            // create an output stream
                            fos = new FileOutputStream(cacheFile);

                            // get the content length
                            long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength();

                            // check the maximum allowed file size
                            if (this.maxFileSize > -1) {
                                if (contentLength == -1) {
                                    // content length unknown: enforce the limit while writing
                                    fos = new httpdBoundedSizeOutputStream(fos, this.maxFileSize);
                                } else if (contentLength > this.maxFileSize) {
                                    remote.close();
                                    this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + this.maxFileSize + " bytes.");
                                    sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                                            plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
                                    return null;
                                }
                            }

                            // write the new cache entry to the file system directly
                            byte[] cacheArray = res.writeContent(fos, false);
                            remote.close();
                            htCache.setCacheArray(cacheArray);
                            plasmaHTCache.writeFileAnnouncement(cacheFile);
                        } finally {
                            if (fos != null) try { fos.close(); } catch (Exception e) { /* ignore this */ }
                            remote.close();
                        }

                        return htCache;
                    } else {
                        // the response does not have a supported mime type or file extension: reject the file
                        remote.close();
                        this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.responseHeader.mime() + " for URL " + entry.url().toString());
                        sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                                plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
                        return null;
                    }
                } catch (SocketException e) {
                    // this may happen if the client suddenly closes its connection
                    // maybe the user has stopped loading
                    // in that case, we are not responsible and just forget it
                    // but we also clean the cache, since it may be only partial
                    // and most likely corrupt
                    if (cacheFile.exists()) cacheFile.delete();
                    this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
                    sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                            plasmaCrawlEURL.DENIED_CONNECTION_ERROR);
                    htCache = null;
                }
            } else if (res.status.startsWith("30")) {
                if (res.responseHeader.containsKey(httpHeader.LOCATION)) {
                    // get the redirection URL
                    String redirectionUrlString = (String) res.responseHeader.get(httpHeader.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
                        sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                                plasmaCrawlEURL.DENIED_REDIRECTION_HEADER_EMPTY);
                        return null;
                    }

                    // normalize the URL
                    yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);

                    // restart crawling with the new url
                    this.log.logInfo("CRAWLER Redirection detected ('" + res.status + "') for URL " + entry.url().toString());
                    this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);

                    // if we are already doing a shutdown we don't need to retry crawling
                    if (Thread.currentThread().isInterrupted()) {
                        this.log.logSevere("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
                        sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                                plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN);
                        return null;
                    }

                    // generate the url hash
                    String urlhash = redirectionUrl.hash();

                    // check if the url was already indexed
                    String dbname = sb.urlExists(urlhash);
                    if (dbname != null) {
                        this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
                        sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                                plasmaCrawlEURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT);
                        return null;
                    }

                    // retry crawling with the new url
                    entry.redirectURL(redirectionUrl);
                    return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
                }
            } else {
                // the response does not have an expected status code: reject the file
                this.log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for URL " + entry.url().toString());

                // not processed any further
                sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1,
                        plasmaCrawlEURL.DENIED_WRONG_HTTP_STATUSCODE + res.statusCode + ")");
            }

            if (remote != null) remote.close();
            return htCache;
        } catch (Exception e) {
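            // classify the error by exception type and message text, and map it
            // to a fail reason code that is stored in the error-URL queue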
            String errorMsg = e.getMessage();
            String failreason = null;

            if ((e instanceof IOException) && (errorMsg != null)
                    && (errorMsg.indexOf("socket closed") >= 0)
                    && (Thread.currentThread().isInterrupted())) {
                this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
                failreason = plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN;
            } else if (e instanceof httpdLimitExceededException) {
                this.log.logWarning("CRAWLER Max file size limit '" + this.maxFileSize + "' exceeded while downloading URL " + entry.url());
                failreason = plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
            } else if (e instanceof MalformedURLException) {
                this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected.");
                failreason = plasmaCrawlEURL.DENIED_MALFORMED_URL;
            } else if (e instanceof NoRouteToHostException) {
                this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'.");
                failreason = plasmaCrawlEURL.DENIED_NO_ROUTE_TO_HOST;
            } else if ((e instanceof UnknownHostException)
                    || ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
                yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash());
                this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true)));
                failreason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST;
            } else if (e instanceof java.net.BindException) {
                this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() + "'. Retrying request.");
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_BIND_EXCEPTION;
            } else if ((errorMsg != null)
                    && ((errorMsg.indexOf("Corrupt GZIP trailer") >= 0)
                    || (errorMsg.indexOf("Not in GZIP format") >= 0)
                    || (errorMsg.indexOf("Unexpected end of ZLIB") >= 0))) {
                this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() + "'. Retrying request without using gzip content encoding.");
                failreason = plasmaCrawlEURL.DENIED_CONTENT_DECODING_ERROR;
                this.acceptEncoding = null;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
                this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() + "'. Retrying request.");
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
                this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() + "'. Retrying request.");
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
                this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() + "'. Retrying request.");
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
                this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'.");
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_REFUSED;
            } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
                this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. Pausing crawlers.");
                sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
                sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
                failreason = plasmaCrawlEURL.DENIED_OUT_OF_DISK_SPACE;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >= 0)) {
                this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'.");
                failreason = plasmaCrawlEURL.DENIED_NETWORK_IS_UNREACHABLE;
            } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found") >= 0)) {
                this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'.");
                failreason = plasmaCrawlEURL.DENIED_SSL_UNTRUSTED_CERT;
            } else {
                this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e);
                failreason = plasmaCrawlEURL.DENIED_CONNECTION_ERROR;
            }

            if (failreason != null) {
                // add the url to the error db
                sb.crawlQueues.errorURL.newEntry(entry, null, new Date(), 1, failreason);
            }
            return null;
        }
    }

}
|