001: // plasmaProtocolLoader.java
002: // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
003: // first published 24.10.2007 on http://yacy.net
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.plasma.crawler;
028:
029: import java.util.Arrays;
030: import java.util.HashSet;
031:
032: import de.anomic.plasma.plasmaCrawlEntry;
033: import de.anomic.plasma.plasmaHTCache;
034: import de.anomic.plasma.plasmaSwitchboard;
035: import de.anomic.server.logging.serverLog;
036:
037: public final class plasmaProtocolLoader {
038:
039: private plasmaSwitchboard sb;
040: private serverLog log;
041: private HashSet<String> supportedProtocols;
042: private plasmaHTTPLoader httpLoader;
043: private plasmaFTPLoader ftpLoader;
044:
045: public plasmaProtocolLoader(plasmaSwitchboard sb, serverLog log) {
046: this .sb = sb;
047: this .log = log;
048: this .supportedProtocols = new HashSet<String>(Arrays
049: .asList(new String[] { "http", "https", "ftp" }));
050:
051: // initiate loader objects
052: httpLoader = new plasmaHTTPLoader(sb, log);
053: ftpLoader = new plasmaFTPLoader(sb, log);
054: }
055:
056: public boolean isSupportedProtocol(String protocol) {
057: if ((protocol == null) || (protocol.length() == 0))
058: return false;
059: return this .supportedProtocols.contains(protocol.trim()
060: .toLowerCase());
061: }
062:
063: @SuppressWarnings("unchecked")
064: public HashSet<String> getSupportedProtocols() {
065: return (HashSet<String>) this .supportedProtocols.clone();
066: }
067:
068: public plasmaHTCache.Entry load(plasmaCrawlEntry entry,
069: String parserMode) {
070: // getting the protocol of the next URL
071: String protocol = entry.url().getProtocol();
072:
073: if ((protocol.equals("http") || (protocol.equals("https"))))
074: return httpLoader.load(entry, parserMode);
075: if (protocol.equals("ftp"))
076: return ftpLoader.load(entry);
077:
078: this .log.logWarning("Unsupported protocol '" + protocol
079: + "' in url " + entry.url());
080: return null;
081: }
082:
083: public String process(plasmaCrawlEntry entry, String parserMode) {
084: // load a resource, store it to htcache and push queue entry to switchboard queue
085: // returns null if everything went fine, a fail reason string if a problem occurred
086: plasmaHTCache.Entry h;
087: try {
088: h = load(entry, parserMode);
089: entry.setStatus("loaded");
090: if (h == null)
091: return "load failed";
092: boolean stored = sb.htEntryStoreProcess(h);
093: entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
094: return (stored) ? null : "not stored";
095: } catch (Exception e) {
096: log.logWarning("problem loading " + entry.url().toString(),
097: e);
098: return "load error - " + e.getMessage();
099: }
100: }
101:
102: }
|