Source Code Cross Referenced for FetchDNS.java » Web Crawler » heritrix » org.archive.crawler.fetcher


/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * FetchDNS
 * Created on Jun 5, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
 */
package org.archive.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

/**
 * Processor to resolve 'dns:' URIs.
 *
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor implements
        CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Defaults.
    private short ClassType = DClass.IN;
    private short TypeType = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES = "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES = Boolean.FALSE;
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES = 6 * 60 * 60; // 6 hrs

    private byte[] reusableBuffer = new byte[1024];

    /**
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this attribute.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        org.archive.crawler.settings.Type e = addElementToDefinition(
                new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
                        "If a DNS lookup fails, whether to fall back to "
                                + "InetAddress resolution, which may use local 'hosts' files "
                                + "or other mechanisms.",
                        DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
                FetchHTTP.ATTR_DIGEST_CONTENT,
                "Whether or not to perform an on-the-fly digest hash of"
                        + " retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
                FetchHTTP.ATTR_DIGEST_ALGORITHM,
                "Which algorithm (for example "
                        + "MD5 or SHA-1) to use to perform an on-the-fly digest"
                        + " hash of retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_ALGORITHM,
                FetchHTTP.DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
    }

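    /**
     * Resolve a 'dns:' CrawlURI: look up the referenced host via dnsjava,
     * record the response as the URI's content, and cache the resulting IP
     * and TTL on the corresponding CrawlHost. URIs with any other scheme
     * are ignored.
     */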
    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // Only handles dns
            return;
        }
        Record[] rrecordSet = null; // Retrieved dns records
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Make sure we're in "normal operating mode", i.e. a cache +
        // controller exist to assist us.
        CrawlHost targetHost = null;
        if (getController() != null
                && getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            // Standalone operation (mostly for test cases/potential other uses)
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Try to get the records for this host (assume domain name)
        // TODO: Bug #935119 concerns potential hang here
        try {
            rrecordSet = (new Lookup(dnsName, TypeType, ClassType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + dnsName);
            }
            if (((Boolean) getUncheckedAttribute(null,
                    ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Do lookup that bypasses dnsjava.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName
                                + " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName
                                + " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

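    /**
     * Store the retrieved record set as the URI's content and update the
     * target host's IP and TTL from the first A record found.
     */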
    protected void storeDNSRecord(final CrawlURI curi,
            final String dnsName, final CrawlHost targetHost,
            final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlServer
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " + dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL,
                    ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for "
                    + curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

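    /**
     * Check whether the 'host' is already a dotted-quad IPv4 address; if so,
     * set the IP on the host directly and skip the DNS lookup entirely.
     *
     * @return true if dnsName was a dotted-quad address and has been handled.
     */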
    protected boolean isQuadAddress(final CrawlURI curi,
            final String dnsName, final CrawlHost targetHost) {
        boolean result = false;
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's an ip no need to do a lookup
        if (matcher == null || !matcher.matches()) {
            return result;
        }

        result = true;
        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
                    (byte) Integer.parseInt(matcher.group(1)),
                    (byte) Integer.parseInt(matcher.group(2)),
                    (byte) Integer.parseInt(matcher.group(3)),
                    (byte) Integer.parseInt(matcher.group(4)) }),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return result;
    }

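    /**
     * Replay the serialized record set through an HttpRecorder so it is
     * captured like any other fetched content, optionally computing an
     * on-the-fly content digest as it streams through.
     */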
    protected void recordDNS(final CrawlURI curi,
            final Record[] rrecordSet) throws IOException {
        final byte[] dnsRecord = getDNSRecord(
                curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean) getUncheckedAttribute(curi,
                FetchHTTP.ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = (String) getUncheckedAttribute(curi,
                    FetchHTTP.ATTR_DIGEST_ALGORITHM);
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear
            rec.getRecordedInput().setDigest((MessageDigest) null);
        }

        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(
                new ByteArrayInputStream(dnsRecord));
        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }
        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        if (digestContent) {
            curi.setContentDigest(algorithm,
                    rec.getRecordedInput().getDigestValue());
        }
    }

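    /**
     * Serialize the record set, prefixed by a 14-digit timestamp per
     * RFC 2540. The result looks roughly like this (illustrative values):
     *
     *   20070113013117
     *   www.example.com.   3600   IN   A   192.0.2.1
     */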
    protected byte[] getDNSRecord(final long fetchStart,
            final Record[] rrecordSet) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        // Don't forget the newline
        baos.write("\n".getBytes());
        int recordLength = fetchDate.length + 1;
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                byte[] record = rrecordSet[i].toString().getBytes();
                recordLength += record.length;
                baos.write(record);
                // Add the newline between records back in
                baos.write("\n".getBytes());
                recordLength += 1;
            }
        }
        return baos.toByteArray();
    }

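    /**
     * Mark the host as having no known IP and the URI as unresolvable.
     */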
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

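    /**
     * @return the first A record in the set, or null if there is none.
     */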
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: "
                        + rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(i)
                            + " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}
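
The core of innerProcess above is a single dnsjava Lookup call. Below is a minimal, self-contained sketch of that lookup outside the Heritrix Processor machinery, assuming only the dnsjava library on the classpath; the class name DnsLookupSketch and the host name "archive.org" are illustrative, not part of Heritrix.

import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.Type;

public class DnsLookupSketch {
    public static void main(String[] args) throws Exception {
        // The same IN/A lookup FetchDNS.innerProcess performs via dnsjava.
        Record[] rrecordSet = new Lookup("archive.org", Type.A, DClass.IN).run();
        if (rrecordSet == null) {
            // FetchDNS would fall back to InetAddress.getByName here when
            // accept-non-dns-resolves is true, else mark the URI unresolvable.
            System.out.println("Lookup failed");
            return;
        }
        for (Record r : rrecordSet) {
            if (r.getType() == Type.A) { // mirrors getFirstARecord's filter
                ARecord a = (ARecord) r;
                System.out.println(a.getName() + " -> "
                        + a.getAddress().getHostAddress()
                        + " (TTL " + a.getTTL() + ")");
            }
        }
    }
}

A null return from run() is how dnsjava signals a failed lookup, which is why innerProcess treats a null rrecordSet as the trigger for its fallback path.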