001: /* ServerCache
002: *
003: * Created on Nov 19, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.datamodel;
024:
025: import java.util.Map;
026: import java.util.Hashtable;
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.framework.CrawlController;
032: import org.archive.crawler.settings.SettingsHandler;
033:
034: /**
035: * Server and Host cache.
036: * @author stack
037: * @version $Date: 2006-09-26 21:49:01 +0000 (Tue, 26 Sep 2006) $, $Revision: 4668 $
038: */
039: public class ServerCache {
040: private static Logger logger = Logger.getLogger(ServerCache.class
041: .getName());
042:
043: protected SettingsHandler settingsHandler = null;
044:
045: /**
046: * hostname[:port] -> CrawlServer.
047: * Set in the initialization.
048: */
049: protected Map<String, CrawlServer> servers = null;
050:
051: /**
052: * hostname -> CrawlHost.
053: * Set in the initialization.
054: */
055: protected Map<String, CrawlHost> hosts = null;
056:
057: /**
058: * Constructor.
059: * Shutdown access to the default constructor by making it protected.
060: */
061: protected ServerCache() {
062: super ();
063: }
064:
065: /**
066: * This constructor creates a ServerCache that is all memory-based using
067: * Hashtables. Used for unit testing only
068: * (Use {@link #ServerCache(CrawlController)} when crawling).
069: * @param sh
070: * @throws Exception
071: */
072: public ServerCache(final SettingsHandler sh) throws Exception {
073: this .settingsHandler = sh;
074: this .servers = new Hashtable<String, CrawlServer>();
075: this .hosts = new Hashtable<String, CrawlHost>();
076: }
077:
078: public ServerCache(final CrawlController c) throws Exception {
079: this .settingsHandler = c.getSettingsHandler();
080: this .servers = c.getBigMap("servers", String.class,
081: CrawlServer.class);
082: this .hosts = c
083: .getBigMap("hosts", String.class, CrawlHost.class);
084: }
085:
086: /**
087: * Get the {@link CrawlServer} associated with <code>name</code>.
088: * @param serverKey Server name we're to return server for.
089: * @return CrawlServer instance that matches the passed server name.
090: */
091: public synchronized CrawlServer getServerFor(String serverKey) {
092: CrawlServer cserver = (CrawlServer) this .servers.get(serverKey);
093: return (cserver != null) ? cserver : createServerFor(serverKey);
094: }
095:
096: protected CrawlServer createServerFor(String s) {
097: CrawlServer cserver = (CrawlServer) this .servers.get(s);
098: if (cserver != null) {
099: return cserver;
100: }
101: // Ensure key is private object
102: String skey = new String(s);
103: cserver = new CrawlServer(skey);
104: cserver.setSettingsHandler(settingsHandler);
105: servers.put(skey, cserver);
106: if (logger.isLoggable(Level.FINER)) {
107: logger.finer("Created server " + s);
108: }
109: return cserver;
110: }
111:
112: /**
113: * Get the {@link CrawlServer} associated with <code>curi</code>.
114: * @param cauri CandidateURI we're to get server from.
115: * @return CrawlServer instance that matches the passed CandidateURI.
116: */
117: public CrawlServer getServerFor(CandidateURI cauri) {
118: CrawlServer cs = null;
119: try {
120: String key = CrawlServer.getServerKey(cauri);
121: // TODOSOMEDAY: make this robust against those rare cases
122: // where authority is not a hostname.
123: if (key != null) {
124: cs = getServerFor(key);
125: }
126: } catch (URIException e) {
127: logger.severe(e.getMessage() + ": " + cauri);
128: e.printStackTrace();
129: } catch (NullPointerException npe) {
130: logger.severe(npe.getMessage() + ": " + cauri);
131: npe.printStackTrace();
132: }
133: return cs;
134: }
135:
136: /**
137: * Get the {@link CrawlHost} associated with <code>name</code>.
138: * @param hostname Host name we're to return Host for.
139: * @return CrawlHost instance that matches the passed Host name.
140: */
141: public synchronized CrawlHost getHostFor(String hostname) {
142: if (hostname == null || hostname.length() == 0) {
143: return null;
144: }
145: CrawlHost host = (CrawlHost) this .hosts.get(hostname);
146: return (host != null) ? host : createHostFor(hostname);
147: }
148:
149: protected CrawlHost createHostFor(String hostname) {
150: if (hostname == null || hostname.length() == 0) {
151: return null;
152: }
153: CrawlHost host = (CrawlHost) this .hosts.get(hostname);
154: if (host != null) {
155: return host;
156: }
157: String hkey = new String(hostname);
158: host = new CrawlHost(hkey);
159: this .hosts.put(hkey, host);
160: if (logger.isLoggable(Level.FINE)) {
161: logger.fine("Created host " + hostname);
162: }
163: return host;
164: }
165:
166: /**
167: * Get the {@link CrawlHost} associated with <code>curi</code>.
168: * @param cauri CandidateURI we're to return Host for.
169: * @return CandidateURI instance that matches the passed Host name.
170: */
171: public CrawlHost getHostFor(CandidateURI cauri) {
172: CrawlHost h = null;
173: try {
174: h = getHostFor(cauri.getUURI().getReferencedHost());
175: } catch (URIException e) {
176: e.printStackTrace();
177: }
178: return h;
179: }
180:
181: /**
182: * @param serverKey Key to use doing lookup.
183: * @return True if a server instance exists.
184: */
185: public boolean containsServer(String serverKey) {
186: return (CrawlServer) servers.get(serverKey) != null;
187: }
188:
189: /**
190: * @param hostKey Key to use doing lookup.
191: * @return True if a host instance exists.
192: */
193: public boolean containsHost(String hostKey) {
194: return (CrawlHost) hosts.get(hostKey) != null;
195: }
196:
197: /**
198: * Called when shutting down the cache so we can do clean up.
199: */
200: public void cleanup() {
201: if (this .hosts != null) {
202: // If we're using a bdb bigmap, the call to clear will
203: // close down the bdb database.
204: this.hosts.clear();
205: this.hosts = null;
206: }
207: if (this.servers != null) {
208: this.servers.clear();
209: this.servers = null;
210: }
211: }
212: }
|