01: /* CrawlStateUpdater
02: *
03: * Created on Jun 5, 2003
04: *
05: * Copyright (C) 2003 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.postprocessor;
24:
25: import java.util.logging.Logger;
26:
27: import org.apache.commons.httpclient.URIException;
28: import org.archive.crawler.datamodel.CoreAttributeConstants;
29: import org.archive.crawler.datamodel.CrawlHost;
30: import org.archive.crawler.datamodel.CrawlServer;
31: import org.archive.crawler.datamodel.CrawlURI;
32: import org.archive.crawler.datamodel.FetchStatusCodes;
33: import org.archive.crawler.framework.Processor;
34: import org.archive.crawler.framework.Frontier.FrontierGroup;
35:
36: /**
37: * A step, late in the processing of a CrawlURI, for updating the per-host
38: * information that may have been affected by the fetch. This will initially
39: * be robots and ip address info; it could include other per-host stats that
40: * would affect the crawl (like total pages visited at the site) as well.
41: *
42: * @author gojomo
43: * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
44: */
45: public class CrawlStateUpdater extends Processor implements
46: CoreAttributeConstants, FetchStatusCodes {
47:
48: private static final long serialVersionUID = -1072728147960180091L;
49:
50: private static final Logger logger = Logger
51: .getLogger(CrawlStateUpdater.class.getName());
52:
53: public CrawlStateUpdater(String name) {
54: super (name, "Crawl state updater");
55: }
56:
57: protected void innerProcess(CrawlURI curi) {
58: // Tally per-server, per-host, per-frontier-class running totals
59: CrawlServer server = getController().getServerCache()
60: .getServerFor(curi);
61: if (server != null) {
62: server.getSubstats().tally(curi);
63: }
64: CrawlHost host = getController().getServerCache().getHostFor(
65: curi);
66: if (host != null) {
67: host.getSubstats().tally(curi);
68: }
69: FrontierGroup group = getController().getFrontier().getGroup(
70: curi);
71: group.getSubstats().tally(curi);
72:
73: String scheme = curi.getUURI().getScheme().toLowerCase();
74: if (scheme.equals("http") || scheme.equals("https")
75: && server != null) {
76: // Update connection problems counter
77: if (curi.getFetchStatus() == S_CONNECT_FAILED) {
78: server.incrementConsecutiveConnectionErrors();
79: } else if (curi.getFetchStatus() > 0) {
80: server.resetConsecutiveConnectionErrors();
81: }
82:
83: // Update robots info
84: try {
85: if (curi.getUURI().getPath() != null
86: && curi.getUURI().getPath().equals(
87: "/robots.txt")) {
88: // Update server with robots info
89: server.updateRobots(curi);
90: }
91: } catch (URIException e) {
92: logger.severe("Failed get path on " + curi.getUURI());
93: }
94: }
95: }
96: }
|