/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header$
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Checkpointer;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.io.ReplayInputStream;
import org.archive.net.UURIFactory;
/**
 * Represents a single remote "server".
 *
 * A server is a service on a host; a single host may offer several such
 * services, differentiated by port number.
 *
 * @author gojomo
 */
public class CrawlServer implements Serializable,
        CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;
    /** Minimum number of fetch attempts before a failed robots.txt
     * fetch is treated as final; below this, the honoring-policy
     * decision is deferred (see {@link #updateRobots(CrawlURI)}). */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually, host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // how many consecutive connection errors have been encountered;
    // used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs
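    //
    // A caller-side backoff derived from this counter might look like
    // the following sketch (baseDelayMs is a hypothetical configured
    // value, not Heritrix's actual retry policy):
    //
    //   long delayMs = baseDelayMs
    //       * (1L << Math.min(consecutiveConnectionErrors, 16));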
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
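     * <p>A trailing {@code :port} in the host string is parsed eagerly.
     * A minimal sketch of the expected behavior (hypothetical hosts):
     * <pre>{@code
     * new CrawlServer("example.com").getPort();      // -1, no port given
     * new CrawlServer("example.com:8443").getPort(); // 8443
     * new CrawlServer("example.com:abc").getPort();  // -1, unparseable
     * }</pre>
     *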
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server
                        .substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    @Override
    public String toString() {
        return "CrawlServer(" + server + ")";
    }

    /** Update the robots exclusion policy.
     *
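     * <p>Intended to be called once a fetch attempt for this server's
     * {@code robots.txt} has completed. A sketch of a call site
     * (hypothetical wiring, not necessarily how Heritrix invokes it):
     * <pre>{@code
     * server.updateRobots(robotsCuri);
     * if (server.isValidRobots()) {
     *     RobotsExclusionPolicy policy = server.getRobots();
     * }
     * }</pre>
     *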
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy = settingsHandler
                .getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

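        // Any positive fetch status over an HTTP transaction counts as a
        // usable response, even if not a 2xx (non-2xx is handled below).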
        boolean gotSomething = curi.getFetchStatus() > 0
                && curi.isHttpTransaction();
        if (!gotSomething
                && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed, no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // robots.txt lookup failed and policy not IGNORE
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Any status code outside the 2xx range (not-found or
            // otherwise) is treated as granting access to all of a
            // site's content. This mirrors the prevailing practice of
            // Google: 4xx responses on robots.txt usually indicate a
            // misconfiguration or a blanket block, not an intentional
            // partial-blocking directive.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    reader = new BufferedReader(new StringReader(
                            honoringPolicy.getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput()
                            .getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }

    /**
     * @return Time when robots.txt was last fetched, in milliseconds
     * since the epoch, or {@link #ROBOTS_NOT_FETCHED} if never fetched.
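     * <p>A caller might use this for a staleness check (sketch;
     * {@code validityMs} is a hypothetical configured duration):
     * <pre>{@code
     * long fetched = server.getRobotsFetchedTime();
     * boolean stale = fetched == CrawlServer.ROBOTS_NOT_FETCHED
     *     || System.currentTimeMillis() - fetched > validityMs;
     * }</pre>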
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string which might include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /**
     * Called when object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to settings handler and robots honoring
     * policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     * cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Thread t = Thread.currentThread();
        if (t instanceof Checkpointer.CheckpointingThread) {
            settingsHandler = ((Checkpointer.CheckpointingThread) t)
                    .getController().getSettingsHandler();
        } else if (t instanceof ToeThread) {
            settingsHandler = ((ToeThread) t)
                    .getController().getSettingsHandler();
        } else {
            // TODO: log differently? (without the throw here an
            // NPE would be inevitable later)
            throw new RuntimeException("CrawlServer must deserialize "
                    + "in a ToeThread or CheckpointingThread");
        }
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy = settingsHandler
                    .getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi the URI whose scope selects the settings.
     * @return the settings object in effect for this server, or null if
     * the URI's referenced host cannot be extracted.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.getSettings(curi.getUURI()
                    .getReferencedHost(), curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server. Returns null if none.
     */
    public Set<CredentialAvatar> getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && !this.avatars.isEmpty();
    }

    /**
     * Add an avatar.
     *
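     * <p>Usage sketch ({@code avatar} is a hypothetical, previously
     * constructed {@link CredentialAvatar}):
     * <pre>{@code
     * if (!server.hasCredentialAvatars()) {
     *     server.addCredentialAvatar(avatar);
     * }
     * }</pre>
     *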
     * @param ca Credential avatar to add to set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true, valid robots.txt information has been retrieved. If false,
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return whether valid robots.txt information is available.
     */
    public boolean isValidRobots() {
        return validRobots;
    }

    /**
     * Get key to use doing lookup on server instances.
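     *
     * <p>A sketch of the expected keys, assuming the default https port
     * is 443 (hypothetical URIs):
     * <pre>{@code
     * // http://example.com/index.html -> "example.com"
     * // http://example.com:8080/      -> "example.com:8080"
     * // https://example.com/          -> "example.com:443"
     * }</pre>
     *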
     * @param cauri CandidateURI we're to get server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri)
            throws URIException {
        // TODO: evaluate if this is really necessary -- why not
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails (e.g. 'dns:'
            // URIs: DNS UURIs carry the 'domain' in the 'path' part, not
            // in the authority).
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Contains characters other than word chars, dots,
                // colons, dashes, and underscores; discard it.
                key = null;
            }
        }
        if (key != null
                && cauri.getUURI().getScheme()
                        .equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add the default https port
            // to distinguish the https server from an http server on the
            // same host without an explicit port.
            if (!key.matches(".+:[0-9]+")) {
                key += ":" + UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}