001: // CrawlerWorker.java
002: // -------------------------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2006
007: //
008: // This file ist contributed by Martin Thelian
009: //
010: // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
011: // $LastChangedRevision: 1715 $
012: // $LastChangedBy: theli $
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027: //
028: // Using this software in any meaning (reading, learning, copying, compiling,
029: // running) means that you agree that the Author(s) is (are) not responsible
030: // for cost, loss of data or any harm that may be caused directly or indirectly
031: // by usage of this softare or this documentation. The usage of this software
032: // is on your own risk. The installation and usage (starting/running) of this
033: // software may allow other people or application to access your computer and
034: // any attached devices and is highly dependent on the configuration of the
035: // software which must be done by the user of the software; the author(s) is
036: // (are) also not responsible for proper configuration and usage of the
037: // software, even if provoked by documentation provided together with
038: // the software.
039: //
040: // Any changes to this file according to the GPL as documented in the file
041: // gpl.txt aside this file in the shipment you received can be done to the
042: // lines that follows this copyright notice here, but changes must not be
043: // done inside the copyright notive above. A re-distribution must contain
044: // the intact and unchanged copyright notice.
045: // Contributions and changes to the program code must be marked as such.
046:
047: package de.anomic.plasma.crawler;
048:
049: import java.io.ByteArrayOutputStream;
050: import java.io.File;
051: import java.io.FileOutputStream;
052: import java.io.PrintStream;
053: import java.io.PrintWriter;
054: import java.util.Date;
055:
056: import de.anomic.net.ftpc;
057: import de.anomic.plasma.plasmaCrawlEURL;
058: import de.anomic.plasma.plasmaCrawlEntry;
059: import de.anomic.plasma.plasmaHTCache;
060: import de.anomic.plasma.plasmaParser;
061: import de.anomic.plasma.plasmaSwitchboard;
062: import de.anomic.plasma.cache.ftp.ResourceInfo;
063: import de.anomic.server.logging.serverLog;
064:
065: public class plasmaFTPLoader {
066:
067: private plasmaSwitchboard sb;
068: private serverLog log;
069:
070: public plasmaFTPLoader(plasmaSwitchboard sb, serverLog log) {
071: this .sb = sb;
072: this .log = log;
073: }
074:
075: protected plasmaHTCache.Entry createCacheEntry(
076: plasmaCrawlEntry entry, String mimeType, Date fileDate) {
077: return plasmaHTCache.newEntry(new Date(), entry.depth(), entry
078: .url(), entry.name(), "OK", new ResourceInfo(entry
079: .url(), sb.getURL(entry.referrerhash()), mimeType,
080: fileDate), entry.initiator(), sb.profilesActiveCrawls
081: .getEntry(entry.profileHandle()));
082: }
083:
084: public plasmaHTCache.Entry load(plasmaCrawlEntry entry) {
085:
086: ByteArrayOutputStream bout = new ByteArrayOutputStream();
087: PrintStream out = new PrintStream(bout);
088:
089: ByteArrayOutputStream berr = new ByteArrayOutputStream();
090: PrintStream err = new PrintStream(berr);
091:
092: // create a new ftp client
093: ftpc ftpClient = new ftpc(System.in, out, err);
094:
095: // get username and password
096: String userInfo = entry.url().getUserInfo();
097: String userName = "anonymous", userPwd = "anonymous";
098: if (userInfo != null) {
099: int pos = userInfo.indexOf(":");
100: if (pos != -1) {
101: userName = userInfo.substring(0, pos);
102: userPwd = userInfo.substring(pos + 1);
103: }
104: }
105:
106: // get server name, port and file path
107: String host = entry.url().getHost();
108: String fullPath = entry.url().getPath();
109: int port = entry.url().getPort();
110:
111: plasmaHTCache.Entry htCache = null;
112: try {
113: // open a connection to the ftp server
114: if (port == -1) {
115: ftpClient.exec("open " + host, false);
116: } else {
117: ftpClient.exec("open " + host + " " + port, false);
118: }
119:
120: // login to the server
121: ftpClient.exec("user " + userName + " " + userPwd, false);
122:
123: // change transfer mode to binary
124: ftpClient.exec("binary", false);
125:
126: // determine filename and path
127: String file, path;
128: if (fullPath.endsWith("/")) {
129: file = "";
130: path = fullPath;
131: } else {
132: int pos = fullPath.lastIndexOf("/");
133: if (pos == -1) {
134: file = fullPath;
135: path = "/";
136: } else {
137: path = fullPath.substring(0, pos + 1);
138: file = fullPath.substring(pos + 1);
139: }
140: }
141:
142: // testing if the specified file is a directory
143: if (file.length() > 0) {
144: ftpClient.exec("cd \"" + path + "\"", false);
145:
146: // testing if the current name is a directoy
147: boolean isFolder = ftpClient.isFolder(file);
148: if (isFolder) {
149: fullPath = fullPath + "/";
150: file = "";
151: }
152: }
153:
154: // creating a cache file object
155: File cacheFile = plasmaHTCache.getCachePath(entry.url());
156:
157: // TODO: aborting download if content is to long ...
158:
159: // TODO: invalid file path check
160:
161: // testing if the file already exists
162: if (cacheFile.isFile()) {
163: // delete the file if it already exists
164: plasmaHTCache.deleteURLfromCache(entry.url());
165: } else {
166: // create parent directories
167: cacheFile.getParentFile().mkdirs();
168: }
169:
170: String mimeType;
171: Date fileDate;
172: if (file.length() == 0) {
173: // getting the dirlist
174: mimeType = "text/html";
175: fileDate = new Date();
176:
177: // create a htcache entry
178: htCache = createCacheEntry(entry, mimeType, fileDate);
179:
180: // generate the dirlist
181: StringBuffer dirList = ftpClient.dirhtml(fullPath);
182:
183: if (dirList != null && dirList.length() > 0)
184: try {
185: // write it into a file
186: PrintWriter writer = new PrintWriter(
187: new FileOutputStream(cacheFile), false);
188: writer.write(dirList.toString());
189: writer.flush();
190: writer.close();
191: } catch (Exception e) {
192: this .log
193: .logInfo("Unable to write dirlist for URL "
194: + entry.url().toString());
195: htCache = null;
196: }
197: } else {
198: // determine the mimetype of the resource
199: String extension = plasmaParser.getFileExt(entry.url());
200: mimeType = plasmaParser.getMimeTypeByFileExt(extension);
201:
202: // if the mimetype and file extension is supported we start to download the file
203: if (plasmaParser.supportedContent(
204: plasmaParser.PARSER_MODE_CRAWLER, entry.url(),
205: mimeType)) {
206:
207: // TODO: determine the real file date
208: fileDate = new Date();
209:
210: // create a htcache entry
211: htCache = createCacheEntry(entry, mimeType,
212: fileDate);
213:
214: // change into working directory
215: ftpClient.exec("cd \"" + fullPath + "\"", false);
216:
217: // download the remote file
218: ftpClient
219: .exec("get \"" + file + "\" \""
220: + cacheFile.getAbsolutePath()
221: + "\"", false);
222: } else {
223: // if the response has not the right file type then reject file
224: this .log.logInfo("REJECTED WRONG MIME/EXT TYPE "
225: + mimeType + " for URL "
226: + entry.url().toString());
227: sb.crawlQueues.errorURL
228: .newEntry(
229: entry,
230: null,
231: new Date(),
232: 1,
233: plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
234: return null;
235: }
236: }
237:
238: // pass the downloaded resource to the cache manager
239: if (berr.size() > 0 || htCache == null) {
240: // if the response has not the right file type then reject file
241: this .log.logWarning("Unable to download URL "
242: + entry.url().toString() + "\nErrorlog: "
243: + berr.toString());
244: sb.crawlQueues.errorURL.newEntry(entry, null,
245: new Date(), 1,
246: plasmaCrawlEURL.DENIED_SERVER_DOWNLOAD_ERROR);
247:
248: // an error has occured. cleanup
249: if (cacheFile.exists())
250: cacheFile.delete();
251: } else {
252: // announce the file
253: plasmaHTCache.writeFileAnnouncement(cacheFile);
254: }
255:
256: return htCache;
257: } finally {
258: // closing connection
259: ftpClient.exec("close", false);
260: ftpClient.exec("exit", false);
261: }
262: }
263:
264: }
|