001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * CrawlScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.framework;
025:
026: import java.io.BufferedReader;
027: import java.io.File;
028: import java.io.FileReader;
029: import java.io.FileWriter;
030: import java.io.IOException;
031: import java.io.Writer;
032: import java.util.HashSet;
033: import java.util.Iterator;
034: import java.util.List;
035: import java.util.Set;
036: import java.util.logging.Logger;
037:
038: import javax.management.AttributeNotFoundException;
039: import javax.management.MBeanException;
040: import javax.management.ReflectionException;
041:
042: import org.apache.commons.httpclient.URIException;
043: import org.archive.crawler.datamodel.CandidateURI;
044: import org.archive.crawler.scope.SeedFileIterator;
045: import org.archive.crawler.scope.SeedListener;
046: import org.archive.crawler.settings.CrawlerSettings;
047: import org.archive.crawler.settings.SimpleType;
048: import org.archive.crawler.settings.Type;
049: import org.archive.net.UURI;
050: import org.archive.util.DevUtils;
051:
052: /**
053: * A CrawlScope instance defines which URIs are "in"
054: * a particular crawl.
055: *
056: * It is essentially a Filter which determines, looking at
057: * the totality of information available about a
 * CandidateURI/CrawlURI instance, if that URI should be
059: * scheduled for crawling.
060: *
061: * Dynamic information inherent in the discovery of the
062: * URI -- such as the path by which it was discovered --
063: * may be considered.
064: *
065: * Dynamic information which requires the consultation
066: * of external and potentially volatile information --
067: * such as current robots.txt requests and the history
068: * of attempts to crawl the same URI -- should NOT be
069: * considered. Those potentially high-latency decisions
070: * should be made at another step.
071: *
072: * @author gojomo
073: *
074: */
075: public class CrawlScope extends Filter {
076:
077: private static final long serialVersionUID = -3321533224526211277L;
078:
079: private static final Logger logger = Logger
080: .getLogger(CrawlScope.class.getName());
081: public static final String ATTR_NAME = "scope";
082: public static final String ATTR_SEEDS = "seedsfile";
083:
084: /**
085: * Whether every configu change should trigger a
086: * rereading of the original seeds spec/file.
087: */
088: public static final String ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
089: public static final Boolean DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
090:
091: protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
092:
093: /** Constructs a new CrawlScope.
094: *
095: * @param name the name is ignored since it always have to be the value of
096: * the constant ATT_NAME.
097: */
098: public CrawlScope(String name) {
099: // 'name' is never used.
100: super (ATTR_NAME, "Crawl scope");
101: Type t;
102: t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
103: "File from which to extract seeds.", "seeds.txt"));
104: t.setOverrideable(false);
105: t.setExpertSetting(true);
106: t = addElementToDefinition(new SimpleType(
107: ATTR_REREAD_SEEDS_ON_CONFIG,
108: "Whether to reread the seeds specification, whether it has "
109: + "changed or not, every time any configuration change occurs. "
110: + "If true, seeds are reread even when (for example) new "
111: + "domain overrides are set. Rereading the seeds can take a "
112: + "long time with large seed lists.",
113: DEFAULT_REREAD_SEEDS_ON_CONFIG));
114: t.setOverrideable(false);
115: t.setExpertSetting(true);
116:
117: }
118:
119: /** Default constructor.
120: */
121: public CrawlScope() {
122: this (ATTR_NAME);
123: }
124:
125: /**
126: * Initialize is called just before the crawler starts to run.
127: *
128: * The settings system is up and initialized so can be used. This
129: * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
130: *
131: * @param controller Controller object.
132: */
133: public void initialize(CrawlController controller) {
134: // by default do nothing (subclasses override)
135: }
136:
137: public String toString() {
138: return "CrawlScope<" + getName() + ">";
139: }
140:
141: /**
142: * Refresh seeds.
143: *
144: */
145: public void refreshSeeds() {
146: // by default do nothing (subclasses which cache should override)
147: }
148:
149: /**
150: * @return Seed list file or null if problem getting settings file.
151: */
152: public File getSeedfile() {
153: File file = null;
154: try {
155: file = getSettingsHandler()
156: .getPathRelativeToWorkingDirectory(
157: (String) getAttribute(ATTR_SEEDS));
158: if (!file.exists() || !file.canRead()) {
159: throw new IOException("Seeds file "
160: + file.getAbsolutePath()
161: + " does not exist or unreadable.");
162: }
163: } catch (IOException e) {
164: DevUtils.warnHandle(e, "problem reading seeds");
165: } catch (AttributeNotFoundException e) {
166: DevUtils.warnHandle(e, "problem reading seeds");
167: } catch (MBeanException e) {
168: DevUtils.warnHandle(e, "problem reading seeds");
169: e.printStackTrace();
170: } catch (ReflectionException e) {
171: DevUtils.warnHandle(e, "problem reading seeds");
172: e.printStackTrace();
173: }
174:
175: return file;
176: }
177:
178: /** Check if a URI is in the seeds.
179: *
180: * @param o the URI to check.
181: * @return true if URI is a seed.
182: */
183: protected boolean isSeed(Object o) {
184: return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
185: }
186:
187: /**
188: * @param a First UURI of compare.
189: * @param b Second UURI of compare.
190: * @return True if UURIs are of same host.
191: */
192: protected boolean isSameHost(UURI a, UURI b) {
193: boolean isSameHost = false;
194: if (a != null && b != null) {
195: // getHost can come back null. See
196: // "[ 910120 ] java.net.URI#getHost fails when leading digit"
197: try {
198: if (a.getReferencedHost() != null
199: && b.getReferencedHost() != null) {
200: if (a.getReferencedHost().equals(
201: b.getReferencedHost())) {
202: isSameHost = true;
203: }
204: }
205: } catch (URIException e) {
206: logger.severe("Failed compare of " + a + " " + b + ": "
207: + e.getMessage());
208: }
209: }
210: return isSameHost;
211: }
212:
213: /* (non-Javadoc)
214: * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
215: */
216: public void listUsedFiles(List<String> list) {
217: // Add seed file
218: try {
219: File file = getSettingsHandler()
220: .getPathRelativeToWorkingDirectory(
221: (String) getAttribute(ATTR_SEEDS));
222: list.add(file.getAbsolutePath());
223: } catch (AttributeNotFoundException e) {
224: // TODO Auto-generated catch block
225: e.printStackTrace();
226: } catch (MBeanException e) {
227: // TODO Auto-generated catch block
228: e.printStackTrace();
229: } catch (ReflectionException e) {
230: // TODO Auto-generated catch block
231: e.printStackTrace();
232: }
233: }
234:
235: /**
236: * Take note of a situation (such as settings edit) where
237: * involved reconfiguration (such as reading from external
238: * files) may be necessary.
239: */
240: public void kickUpdate() {
241: // TODO: further improve this so that case with hundreds of
242: // thousands or millions of seeds works better without requiring
243: // this specific settings check
244: if (((Boolean) getUncheckedAttribute(null,
245: ATTR_REREAD_SEEDS_ON_CONFIG)).booleanValue()) {
246: refreshSeeds();
247: getSettingsHandler().getOrder().getController()
248: .getFrontier().loadSeeds();
249: }
250: }
251:
252: /**
253: * Gets an iterator over all configured seeds. Subclasses
254: * which cache seeds in memory can override with more
255: * efficient implementation.
256: *
257: * @return Iterator, perhaps over a disk file, of seeds
258: */
259: public Iterator<UURI> seedsIterator() {
260: return seedsIterator(null);
261: }
262:
263: /**
264: * Gets an iterator over all configured seeds. Subclasses
265: * which cache seeds in memory can override with more
266: * efficient implementation.
267: *
268: * @param ignoredItemWriter optional writer to get ignored seed items report
269: * @return Iterator, perhaps over a disk file, of seeds
270: */
271: public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
272: BufferedReader br;
273: try {
274: br = new BufferedReader(new FileReader(getSeedfile()));
275: } catch (IOException e) {
276: throw new RuntimeException(e);
277: }
278: return new SeedFileIterator(br, ignoredItemWriter);
279: }
280:
281: /**
282: * Convenience method to close SeedFileIterator, if appropriate.
283: *
284: * @param iter Iterator to check if SeedFileIterator needing closing
285: */
286: protected void checkClose(Iterator iter) {
287: if (iter instanceof SeedFileIterator) {
288: ((SeedFileIterator) iter).close();
289: }
290: }
291:
292: /**
293: * Add a new seed to scope. By default, simply appends
294: * to seeds file, though subclasses may handle differently.
295: *
296: * <p>This method is *not* sufficient to get the new seed
297: * scheduled in the Frontier for crawling -- it only
298: * affects the Scope's seed record (and decisions which
299: * flow from seeds).
300: *
301: * @param curi CandidateUri to add
302: * @return true if successful, false if add failed for any reason
303: */
304: public boolean addSeed(final CandidateURI curi) {
305: File f = getSeedfile();
306: if (f != null) {
307: try {
308: FileWriter fw = new FileWriter(f, true);
309: // Write to new (last) line the URL.
310: fw.write("\n");
311: fw.write("# Heritrix added seed "
312: + ((curi.getVia() != null) ? "redirect from "
313: + curi.getVia() : "(JMX)") + ".\n");
314: fw.write(curi.toString());
315: fw.flush();
316: fw.close();
317: Iterator iter = seedListeners.iterator();
318: while (iter.hasNext()) {
319: ((SeedListener) iter.next()).addedSeed(curi);
320: }
321: return true;
322: } catch (IOException e) {
323: DevUtils.warnHandle(e, "problem writing new seed");
324: }
325: }
326: return false;
327: }
328:
329: public void addSeedListener(SeedListener sl) {
330: seedListeners.add(sl);
331: }
332: }
|