001: /* FetchFTP.java
002: *
003: * $Id: FetchFTP.java 5080 2007-04-13 20:30:49Z gojomo $
004: *
005: * Created on Jun 5, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.fetcher;
026:
027: import java.io.IOException;
028: import java.io.UnsupportedEncodingException;
029: import java.net.Socket;
030: import java.net.URLEncoder;
031: import java.util.logging.Level;
032: import java.util.logging.Logger;
033: import java.util.regex.Matcher;
034: import java.util.regex.Pattern;
035:
036: import javax.management.AttributeNotFoundException;
037:
038: import org.apache.commons.httpclient.URIException;
039: import org.apache.commons.net.ftp.FTPCommand;
040: import org.archive.crawler.datamodel.CrawlURI;
041: import org.archive.crawler.datamodel.CoreAttributeConstants;
042: import org.archive.crawler.datamodel.FetchStatusCodes;
043: import org.archive.crawler.extractor.Link;
044: import static org.archive.crawler.extractor.Link.NAVLINK_HOP;
045: import static org.archive.crawler.extractor.Link.NAVLINK_MISC;
046: import org.archive.crawler.framework.Processor;
047: import org.archive.crawler.settings.SimpleType;
048: import org.archive.io.RecordingInputStream;
049: import org.archive.io.ReplayCharSequence;
050: import org.archive.net.ClientFTP;
051: import org.archive.net.FTPException;
052: import org.archive.net.UURI;
053: import org.archive.util.ArchiveUtils;
054: import org.archive.util.HttpRecorder;
055:
/**
 * Fetches documents and directory listings using FTP. This class will also
 * try to extract FTP "links" from directory listings. For this class to
 * archive a directory listing, the remote FTP server must support the NLST
 * (name list) command. Most modern FTP servers should.
 *
 * @author pjack
 *
 */
065: public class FetchFTP extends Processor implements
066: CoreAttributeConstants {
067:
068: /** Serialization ID; robust against trivial API changes. */
069: private static final long serialVersionUID = ArchiveUtils
070: .classnameBasedUID(FetchFTP.class, 1);
071:
072: /** Logger for this class. */
073: private static Logger logger = Logger.getLogger(FetchFTP.class
074: .getName());
075:
076: /** Pattern for matching directory entries. */
077: private static Pattern DIR = Pattern.compile("(.+)$",
078: Pattern.MULTILINE);
079:
080: /** The name for the <code>username</code> attribute. */
081: final public static String ATTR_USERNAME = "username";
082:
083: /** The description for the <code>username</code> attribute. */
084: final private static String DESC_USERNAME = "The username to send to "
085: + "FTP servers. By convention, the default value of \"anonymous\" is "
086: + "used for publicly available FTP sites.";
087:
088: /** The default value for the <code>username</code> attribute. */
089: final private static String DEFAULT_USERNAME = "anonymous";
090:
091: /** The name for the <code>password</code> attribute. */
092: final public static String ATTR_PASSWORD = "password";
093:
094: /** The description for the <code>password</code> attribute. */
095: final private static String DESC_PASSWORD = "The password to send to "
096: + "FTP servers. By convention, anonymous users send their email address "
097: + "in this field.";
098:
099: /** The default value for the <code>password</code> attribute. */
100: final private static String DEFAULT_PASSWORD = "";
101:
102: /** The name for the <code>extract-from-dirs</code> attribute. */
103: final private static String ATTR_EXTRACT = "extract-from-dirs";
104:
105: /** The description for the <code>extract-from-dirs</code> attribute. */
106: final private static String DESC_EXTRACT = "Set to true to extract "
107: + "further URIs from FTP directories. Default is true.";
108:
109: /** The default value for the <code>extract-from-dirs</code> attribute. */
110: final private static boolean DEFAULT_EXTRACT = true;
111:
112: /** The name for the <code>extract-parent</code> attribute. */
113: final private static String ATTR_EXTRACT_PARENT = "extract_parent";
114:
115: /** The description for the <code>extract-parent</code> attribute. */
116: final private static String DESC_EXTRACT_PARENT = "Set to true to extract "
117: + "the parent URI from all FTP URIs. Default is true.";
118:
119: /** The default value for the <code>extract-parent</code> attribute. */
120: final private static boolean DEFAULT_EXTRACT_PARENT = true;
121:
122: /** The name for the <code>max-length-bytes</code> attribute. */
123: final public static String ATTR_MAX_LENGTH = "max-length-bytes";
124:
125: /** The description for the <code>max-length-bytes</code> attribute. */
126: final private static String DESC_MAX_LENGTH = "Maximum length in bytes to fetch.\n"
127: + "Fetch is truncated at this length. A value of 0 means no limit.";
128:
129: /** The default value for the <code>max-length-bytes</code> attribute. */
130: final private static long DEFAULT_MAX_LENGTH = 0;
131:
132: /** The name for the <code>fetch-bandwidth</code> attribute. */
133: final public static String ATTR_BANDWIDTH = "fetch-bandwidth";
134:
135: /** The description for the <code>fetch-bandwidth</code> attribute. */
136: final private static String DESC_BANDWIDTH = "";
137:
138: /** The default value for the <code>fetch-bandwidth</code> attribute. */
139: final private static int DEFAULT_BANDWIDTH = 0;
140:
141: /** The name for the <code>timeout-seconds</code> attribute. */
142: final public static String ATTR_TIMEOUT = "timeout-seconds";
143:
144: /** The description for the <code>timeout-seconds</code> attribute. */
145: final private static String DESC_TIMEOUT = "If the fetch is not "
146: + "completed in this number of seconds, give up (and retry later).";
147:
148: /** The default value for the <code>timeout-seconds</code> attribute. */
149: final private static int DEFAULT_TIMEOUT = 1200;
150:
/**
 * Creates the FTP fetch processor and registers each of its configurable
 * attributes together with its description and default value.
 *
 * @param name the name of this processor
 */
public FetchFTP(String name) {
    super(name, "FTP Fetcher.");

    // Credentials presented to the remote server.
    add(ATTR_USERNAME, DESC_USERNAME, DEFAULT_USERNAME);
    add(ATTR_PASSWORD, DESC_PASSWORD, DEFAULT_PASSWORD);

    // Switches controlling which links are discovered.
    add(ATTR_EXTRACT, DESC_EXTRACT, DEFAULT_EXTRACT);
    add(ATTR_EXTRACT_PARENT, DESC_EXTRACT_PARENT, DEFAULT_EXTRACT_PARENT);

    // Limits applied to each transfer.
    add(ATTR_MAX_LENGTH, DESC_MAX_LENGTH, DEFAULT_MAX_LENGTH);
    add(ATTR_BANDWIDTH, DESC_BANDWIDTH, DEFAULT_BANDWIDTH);
    add(ATTR_TIMEOUT, DESC_TIMEOUT, DEFAULT_TIMEOUT);
}
167:
168: /**
169: * Convenience method for adding an attribute.
170: *
171: * @param name The name of the attribute
172: * @param desc The description of the attribute
173: * @param def The default value for the attribute
174: */
175: private void add(String name, String desc, Object def) {
176: SimpleType st = new SimpleType(name, desc, def);
177: addElementToDefinition(st);
178: }
179:
180: /**
181: * Convenience method for extracting an attribute.
182: * If a value for the specified name cannot be found,
183: * a warning is written to the log and the specified
184: * default value is returned instead.
185: *
186: * @param context The context for the attribute fetch
187: * @param name The name of the attribute to fetch
188: * @param def The value to return if the attribute isn't found
189: * @return The value of that attribute
190: */
191: private Object get(Object context, String name, Object def) {
192: try {
193: return getAttribute(context, name);
194: } catch (AttributeNotFoundException e) {
195: logger.warning("Attribute not found (using default): "
196: + name);
197: return def;
198: }
199: }
200:
201: /**
202: * Processes the given URI. If the given URI is not an FTP URI, then
203: * this method does nothing. Otherwise an attempt is made to connect
204: * to the FTP server.
205: *
206: * <p>If the connection is successful, an attempt will be made to CD to
207: * the path specified in the URI. If the remote CD command succeeds,
208: * then it is assumed that the URI represents a directory. If the
209: * CD command fails, then it is assumed that the URI represents
210: * a file.
211: *
212: * <p>For directories, the directory listing will be fetched using
213: * the FTP LIST command, and saved to the HttpRecorder. If the
214: * <code>extract.from.dirs</code> attribute is set to true, then
215: * the files in the fetched list will be added to the curi as
216: * extracted FTP links. (It was easier to do that here, rather
217: * than writing a separate FTPExtractor.)
218: *
219: * <p>For files, the file will be fetched using the FTP RETR
220: * command, and saved to the HttpRecorder.
221: *
222: * <p>All file transfers (including directory listings) occur using
223: * Binary mode transfer. Also, the local passive transfer mode
224: * is always used, to play well with firewalls.
225: *
226: * @param curi the curi to process
227: * @throws InterruptedException if the thread is interrupted during
228: * processing
229: */
230: public void innerProcess(CrawlURI curi) throws InterruptedException {
231: if (!curi.getUURI().getScheme().equals("ftp")) {
232: return;
233: }
234:
235: curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
236: HttpRecorder recorder = HttpRecorder.getHttpRecorder();
237: ClientFTP client = new ClientFTP();
238:
239: try {
240: fetch(curi, client, recorder);
241: } catch (FTPException e) {
242: logger.log(Level.SEVERE, "FTP server reported problem.", e);
243: curi.setFetchStatus(e.getReplyCode());
244: } catch (IOException e) {
245: logger.log(Level.SEVERE, "IO Error during FTP fetch.", e);
246: curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
247: } finally {
248: disconnect(client);
249: curi.setContentSize(recorder.getRecordedInput().getSize());
250: curi.putLong(A_FETCH_COMPLETED_TIME, System
251: .currentTimeMillis());
252: }
253: }
254:
255: /**
256: * Fetches a document from an FTP server.
257: *
258: * @param curi the URI of the document to fetch
259: * @param client the FTPClient to use for the fetch
260: * @param recorder the recorder to preserve the document in
261: * @throws IOException if a network or protocol error occurs
262: * @throws InterruptedException if the thread is interrupted
263: */
264: private void fetch(CrawlURI curi, ClientFTP client,
265: HttpRecorder recorder) throws IOException,
266: InterruptedException {
267: // Connect to the FTP server.
268: UURI uuri = curi.getUURI();
269: int port = uuri.getPort();
270: if (port == -1) {
271: port = 21;
272: }
273: client.connectStrict(uuri.getHost(), port);
274:
275: // Authenticate.
276: String[] auth = getAuth(curi);
277: client.loginStrict(auth[0], auth[1]);
278:
279: // The given resource may or may not be a directory.
280: // To figure out which is which, execute a CD command to
281: // the UURI's path. If CD works, it's a directory.
282: boolean dir = client.changeWorkingDirectory(uuri.getPath());
283: if (dir) {
284: curi.setContentType("text/plain");
285: }
286:
287: // TODO: A future version of this class could use the system string to
288: // set up custom directory parsing if the FTP server doesn't support
289: // the nlist command.
290: if (logger.isLoggable(Level.FINE)) {
291: String system = client.getSystemName();
292: logger.fine(system);
293: }
294:
295: // Get a data socket. This will either be the result of a NLIST
296: // command for a directory, or a RETR command for a file.
297: int command = dir ? FTPCommand.NLST : FTPCommand.RETR;
298: String path = dir ? "." : uuri.getPath();
299: client.enterLocalPassiveMode();
300: client.setBinary();
301: Socket socket = client.openDataConnection(command, path);
302: curi.setFetchStatus(client.getReplyCode());
303:
304: // Save the streams in the CURI, where downstream processors
305: // expect to find them.
306: try {
307: saveToRecorder(curi, socket, recorder);
308: } finally {
309: recorder.close();
310: close(socket);
311: }
312:
313: curi.setFetchStatus(200);
314: if (dir) {
315: extract(curi, recorder);
316: }
317: addParent(curi);
318: }
319:
320: /**
321: * Saves the given socket to the given recorder.
322: *
323: * @param curi the curi that owns the recorder
324: * @param socket the socket whose streams to save
325: * @param recorder the recorder to save them to
326: * @throws IOException if a network or file error occurs
327: * @throws InterruptedException if the thread is interrupted
328: */
329: private void saveToRecorder(CrawlURI curi, Socket socket,
330: HttpRecorder recorder) throws IOException,
331: InterruptedException {
332: curi.setHttpRecorder(recorder);
333: recorder.markContentBegin();
334: recorder.inputWrap(socket.getInputStream());
335: recorder.outputWrap(socket.getOutputStream());
336:
337: // Read the remote file/dir listing in its entirety.
338: long softMax = 0;
339: long hardMax = getMaxLength(curi);
340: long timeout = (long) getTimeout(curi) * 1000;
341: int maxRate = getFetchBandwidth(curi);
342: RecordingInputStream input = recorder.getRecordedInput();
343: input.setLimits(hardMax, timeout, maxRate);
344: input.readFullyOrUntil(softMax);
345: }
346:
347: /**
348: * Extract FTP links in a directory listing.
349: * The listing must already be saved to the given recorder.
350: *
351: * @param curi The curi to save extracted links to
352: * @param recorder The recorder containing the directory listing
353: */
354: private void extract(CrawlURI curi, HttpRecorder recorder) {
355: if (!getExtractFromDirs(curi)) {
356: return;
357: }
358:
359: ReplayCharSequence seq = null;
360: try {
361: seq = recorder.getReplayCharSequence();
362: extract(curi, seq);
363: } catch (IOException e) {
364: logger.log(Level.SEVERE, "IO error during extraction.", e);
365: } catch (RuntimeException e) {
366: logger.log(Level.SEVERE, "IO error during extraction.", e);
367: } finally {
368: close(seq);
369: }
370: }
371:
372: /**
373: * Extracts FTP links in a directory listing.
374: *
375: * @param curi The curi to save extracted links to
376: * @param dir The directory listing to extract links from
377: * @throws URIException if an extracted link is invalid
378: */
379: private void extract(CrawlURI curi, ReplayCharSequence dir) {
380: logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
381: Matcher matcher = DIR.matcher(dir);
382: while (matcher.find()) {
383: String file = matcher.group(1);
384: addExtracted(curi, file);
385: }
386: }
387:
388: /**
389: * Adds an extracted filename to the curi. A new URI will be formed
390: * by taking the given curi (which should represent the directory the
391: * file lives in) and appending the file.
392: *
393: * @param curi the curi to store the discovered link in
394: * @param file the filename of the discovered link
395: */
396: private void addExtracted(CrawlURI curi, String file) {
397: try {
398: file = URLEncoder.encode(file, "UTF-8");
399: } catch (UnsupportedEncodingException e) {
400: throw new AssertionError(e);
401: }
402: if (logger.isLoggable(Level.FINEST)) {
403: logger.log(Level.FINEST, "Found " + file);
404: }
405: String base = curi.toString();
406: if (base.endsWith("/")) {
407: base = base.substring(0, base.length() - 1);
408: }
409: try {
410: UURI n = new UURI(base + "/" + file, true);
411: Link link = new Link(curi.getUURI(), n, NAVLINK_MISC,
412: NAVLINK_HOP);
413: curi.addOutLink(link);
414: } catch (URIException e) {
415: logger
416: .log(Level.WARNING, "URI error during extraction.",
417: e);
418: }
419: }
420:
421: /**
422: * Extracts the parent URI from the given curi, then adds that parent
423: * URI as a discovered link to the curi.
424: *
425: * <p>If the <code>extract-parent</code> attribute is false, then this
426: * method does nothing. Also, if the path of the given curi is
427: * <code>/</code>, then this method does nothing.
428: *
429: * <p>Otherwise the parent is determined by eliminated the lowest part
430: * of the URI's path. Eg, the parent of <code>ftp://foo.com/one/two</code>
431: * is <code>ftp://foo.com/one</code>.
432: *
433: * @param curi the curi whose parent to add
434: */
435: private void addParent(CrawlURI curi) {
436: if (!getExtractParent(curi)) {
437: return;
438: }
439: UURI uuri = curi.getUURI();
440: try {
441: if (uuri.getPath().equals("/")) {
442: // There's no parent to add.
443: return;
444: }
445: String scheme = uuri.getScheme();
446: String auth = uuri.getEscapedAuthority();
447: String path = uuri.getEscapedCurrentHierPath();
448: UURI parent = new UURI(scheme + "://" + auth + path, false);
449:
450: Link link = new Link(uuri, parent, NAVLINK_MISC,
451: NAVLINK_HOP);
452: curi.addOutLink(link);
453: } catch (URIException e) {
454: logger
455: .log(Level.WARNING, "URI error during extraction.",
456: e);
457: }
458: }
459:
460: /**
461: * Returns the <code>extract.from.dirs</code> attribute for this
462: * <code>FetchFTP</code> and the given curi.
463: *
464: * @param curi the curi whose attribute to return
465: * @return that curi's <code>extract.from.dirs</code>
466: */
467: public boolean getExtractFromDirs(CrawlURI curi) {
468: return (Boolean) get(curi, ATTR_EXTRACT, DEFAULT_EXTRACT);
469: }
470:
471: /**
472: * Returns the <code>extract.parent</code> attribute for this
473: * <code>FetchFTP</code> and the given curi.
474: *
475: * @param curi the curi whose attribute to return
476: * @return that curi's <code>extract-parent</code>
477: */
478: public boolean getExtractParent(CrawlURI curi) {
479: return (Boolean) get(curi, ATTR_EXTRACT_PARENT,
480: DEFAULT_EXTRACT_PARENT);
481: }
482:
483: /**
484: * Returns the <code>timeout-seconds</code> attribute for this
485: * <code>FetchFTP</code> and the given curi.
486: *
487: * @param curi the curi whose attribute to return
488: * @return that curi's <code>timeout-seconds</code>
489: */
490: public int getTimeout(CrawlURI curi) {
491: return (Integer) get(curi, ATTR_TIMEOUT, DEFAULT_TIMEOUT);
492: }
493:
494: /**
495: * Returns the <code>max-length-bytes</code> attribute for this
496: * <code>FetchFTP</code> and the given curi.
497: *
498: * @param curi the curi whose attribute to return
499: * @return that curi's <code>max-length-bytes</code>
500: */
501: public long getMaxLength(CrawlURI curi) {
502: return (Long) get(curi, ATTR_MAX_LENGTH, DEFAULT_MAX_LENGTH);
503: }
504:
505: /**
506: * Returns the <code>fetch-bandwidth</code> attribute for this
507: * <code>FetchFTP</code> and the given curi.
508: *
509: * @param curi the curi whose attribute to return
510: * @return that curi's <code>fetch-bandwidth</code>
511: */
512: public int getFetchBandwidth(CrawlURI curi) {
513: return (Integer) get(curi, ATTR_BANDWIDTH, DEFAULT_BANDWIDTH);
514: }
515:
516: /**
517: * Returns the username and password for the given URI. This method
518: * always returns an array of length 2. The first element in the returned
519: * array is the username for the URI, and the second element is the
520: * password.
521: *
522: * <p>If the URI itself contains the username and password (i.e., it looks
523: * like <code>ftp://username:password@host/path</code>) then that username
524: * and password are returned.
525: *
526: * <p>Otherwise the settings system is probed for the <code>username</code>
527: * and <code>password</code> attributes for this <code>FTPFetch</code>
528: * and the given <code>curi</code> context. The values of those
529: * attributes are then returned.
530: *
531: * @param curi the curi whose username and password to return
532: * @return an array containing the username and password
533: */
534: private String[] getAuth(CrawlURI curi) {
535: String[] result = new String[2];
536: UURI uuri = curi.getUURI();
537: String userinfo;
538: try {
539: userinfo = uuri.getUserinfo();
540: } catch (URIException e) {
541: assert false;
542: logger.finest("getUserinfo raised URIException.");
543: userinfo = null;
544: }
545: if (userinfo != null) {
546: int p = userinfo.indexOf(':');
547: if (p > 0) {
548: result[0] = userinfo.substring(0, p);
549: result[1] = userinfo.substring(p + 1);
550: return result;
551: }
552: }
553: result[0] = (String) get(curi, ATTR_USERNAME, DEFAULT_USERNAME);
554: result[1] = (String) get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
555: return result;
556: }
557:
558: /**
559: * Determines the password for the given URI. If the URI itself contains
560: * a password, then that password is returned. Otherwise the settings
561: * system is probed for the <code>password</code> attribute, and the value
562: * for that attribute is returned.
563: *
564: * @param curi the curi whose password to return
565: * @return that password
566: */
567: public String determinePassword(CrawlURI curi) {
568: return (String) get(curi, ATTR_PASSWORD, DEFAULT_PASSWORD);
569: }
570:
571: /**
572: * Quietly closes the given socket.
573: *
574: * @param socket the socket to close
575: */
576: private static void close(Socket socket) {
577: try {
578: socket.close();
579: } catch (IOException e) {
580: logger.log(Level.WARNING, "IO error closing socket.", e);
581: }
582: }
583:
584: /**
585: * Quietly closes the given sequence.
586: * If an IOException is raised, this method logs it as a warning.
587: *
588: * @param seq the sequence to close
589: */
590: private static void close(ReplayCharSequence seq) {
591: if (seq == null) {
592: return;
593: }
594: try {
595: seq.close();
596: } catch (IOException e) {
597: logger.log(Level.WARNING,
598: "IO error closing ReplayCharSequence.", e);
599: }
600: }
601:
602: /**
603: * Quietly disconnects from the given FTP client.
604: * If an IOException is raised, this method logs it as a warning.
605: *
606: * @param client the client to disconnect
607: */
608: private static void disconnect(ClientFTP client) {
609: if (client.isConnected())
610: try {
611: client.disconnect();
612: } catch (IOException e) {
613: if (logger.isLoggable(Level.WARNING)) {
614: logger
615: .warning("Could not disconnect from FTP client: "
616: + e.getMessage());
617: }
618: }
619: }
620:
621: }
|