/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * FetchDNS
 * Created on Jun 5, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
 */
package org.archive.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

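/*
 * For orientation, a minimal sketch (illustrative names and values, not
 * project code) of the dnsjava call this processor makes; Lookup's
 * constructor throws TextParseException on malformed names, and run()
 * returns null when resolution fails:
 *
 *   Record[] records = new Lookup("www.example.com", Type.A, DClass.IN).run();
 *   if (records != null && records.length > 0) {
 *       // A records carry the resolved address and its TTL.
 *   }
 */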
/**
 * Processor to resolve 'dns:' URIs (e.g. 'dns:www.example.com').
 *
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor
implements CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Defaults: look up Internet-class (IN) A records.
    private short dclass = DClass.IN;
    private short rrtype = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES =
        "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES =
        Boolean.FALSE;
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES =
        6 * 60 * 60; // 6 hrs.

    private byte[] reusableBuffer = new byte[1024];

    /**
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this processor module.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        org.archive.crawler.settings.Type e = addElementToDefinition(
            new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
                "If a DNS lookup fails, whether or not to fall back to "
                + "InetAddress resolution, which may use local 'hosts' files "
                + "or other mechanisms.",
                DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            FetchHTTP.ATTR_DIGEST_CONTENT,
            "Whether or not to perform an on-the-fly digest hash of"
            + " retrieved content-bodies.",
            FetchHTTP.DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            FetchHTTP.ATTR_DIGEST_ALGORITHM,
            "Which algorithm (for example MD5 or SHA-1) to use to perform"
            + " an on-the-fly digest hash of retrieved content-bodies.",
            FetchHTTP.DEFAULT_DIGEST_ALGORITHM,
            FetchHTTP.DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
    }

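    /**
     * Resolve a single 'dns:' CrawlURI: look up the referenced host with
     * dnsjava, record the response, and update the matching CrawlHost with
     * the found IP and TTL. Optionally falls back to plain InetAddress
     * resolution when the dnsjava lookup fails.
     *
     * @param curi the 'dns:' scheme CrawlURI to process.
     */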
    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // Only handles 'dns:' scheme URIs.
            return;
        }
        Record[] rrecordSet = null; // Retrieved DNS records.
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Make sure we're in "normal operating mode", i.e. a cache and
        // controller exist to assist us.
        CrawlHost targetHost = null;
        if (getController() != null
                && getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            // Standalone operation (mostly for test cases/potential other uses).
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Try to get the records for this host (assume domain name).
        // TODO: Bug #935119 concerns potential hang here.
        try {
            rrecordSet = (new Lookup(dnsName, rrtype, dclass)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed to find recordset for " + dnsName);
            }
            if (((Boolean) getUncheckedAttribute(null,
                    ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Do a lookup that bypasses dnsjava.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName
                            + " using native DNS.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed to find address for " + dnsName
                            + " using native DNS.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

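    /**
     * Store a successful lookup: set the host's IP and TTL from the first
     * A record and write the recordset via recordDNS.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName hostname that was looked up.
     * @param targetHost CrawlHost to update with the found IP.
     * @param rrecordSet records returned by the lookup.
     */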
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu), then update the CrawlHost.
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " + dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL,
                ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for "
                + curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

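    /**
     * Check whether the 'hostname' is already a dotted-quad IPv4 address;
     * if so, set the host's IP directly and skip the lookup entirely.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param dnsName hostname (possibly a dotted-quad address) to examine.
     * @param targetHost CrawlHost to update if dnsName is an address.
     * @return true if dnsName was a dotted-quad address handled here.
     */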
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
            final CrawlHost targetHost) {
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's already a dotted-quad IP, there is no need to do a lookup.
        if (!matcher.matches()) {
            return false;
        }

        // Ideally this branch would never be reached: no CrawlURI
        // should be created for numerical IPs.
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
                    (byte) Integer.parseInt(matcher.group(1)),
                    (byte) Integer.parseInt(matcher.group(2)),
                    (byte) Integer.parseInt(matcher.group(3)),
                    (byte) Integer.parseInt(matcher.group(4)) }),
                CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs.
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never happen: " + e.getMessage(),
                e);
            setUnresolvable(curi, targetHost);
        }
        return true;
    }

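    /**
     * Play the rendered DNS response through an HttpRecorder so it is
     * written to scratch space, and optionally digested, like any other
     * fetched content body.
     *
     * @param curi the 'dns:' CrawlURI being processed.
     * @param rrecordSet records to render and record.
     * @throws IOException on failure recording the response.
     */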
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
            throws IOException {
        final byte[] dnsRecord =
            getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Should we compute a digest of the downloaded content?
        boolean digestContent = ((Boolean) getUncheckedAttribute(curi,
            FetchHTTP.ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = (String) getUncheckedAttribute(curi,
                FetchHTTP.ATTR_DIGEST_ALGORITHM);
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // Clear any digest left over from a previous use of the recorder.
            rec.getRecordedInput().setDigest((MessageDigest) null);
        }

        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(
            new ByteArrayInputStream(dnsRecord));
        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }
        // Reading from the wrapped stream will, behind the scenes, write
        // the bytes into scratch space.
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
    }

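    /**
     * Render the recordset as the byte sequence to store: a 14-digit
     * timestamp line (per RFC 2540) followed by one line per record in
     * dnsjava's textual form. An illustrative (not literal) example:
     * <pre>
     * 20030605121500
     * www.example.com.  3600  IN  A  192.0.2.1
     * </pre>
     *
     * @param fetchStart time the lookup began, as epoch milliseconds.
     * @param rrecordSet records to render; may be null.
     * @return the rendered bytes.
     * @throws IOException on failure writing the in-memory buffer.
     */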
    protected byte[] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540.
        baos.write(ArchiveUtils.get14DigitDate(fetchStart).getBytes());
        // Don't forget the newline.
        baos.write("\n".getBytes());
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                baos.write(rrecordSet[i].toString().getBytes());
                // Add the newline between records back in.
                baos.write("\n".getBytes());
            }
        }
        return baos.toByteArray();
    }

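    /**
     * Mark the host unresolvable and the CrawlURI as failed.
     *
     * @param curi CrawlURI to flag as S_DOMAIN_UNRESOLVABLE.
     * @param host CrawlHost whose IP gets cleared.
     */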
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

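    /**
     * Find the first A record in the given recordset.
     *
     * @param rrecordSet records to scan; may be null or empty.
     * @return the first A record found, or null if there is none.
     */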
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: "
                    + rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + i + " is not A type but "
                        + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}