001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * SurtPrefixScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.scope;
025:
026: import java.io.File;
027: import java.io.FileReader;
028: import java.io.FileWriter;
029: import java.io.IOException;
030:
031: import org.archive.crawler.datamodel.CandidateURI;
032: import org.archive.crawler.deciderules.DecidingScope;
033: import org.archive.crawler.framework.CrawlController;
034: import org.archive.crawler.settings.SimpleType;
035: import org.archive.crawler.settings.Type;
036: import org.archive.util.SurtPrefixSet;
037:
038: /**
039: * A specialized CrawlScope suitable for the most common crawl needs.
040: *
041: * Roughly, as with other existing CrawlScope variants, SurtPrefixScope's logic
042: * is that a URI is included if:
043: * <pre>
044: * ( isSeed(uri) || focusFilter.accepts(uri) ) ||
045: * transitiveFilter.accepts(uri) ) && ! excludeFilter.accepts(uri)
046: * </pre>
047: * Specifically, SurtPrefixScope uses a SurtFilter to test for focus-inclusion.
048: *
049: * @author gojomo
050: * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
051: */
052: public class SurtPrefixScope extends RefinedScope {
053:
054: private static final long serialVersionUID = 2652008287322770123L;
055:
056: public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
057: public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
058: public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
059:
060: private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(
061: true);
062:
063: /**
064: * Whether the 'via' of CrawlURIs should also be checked
065: * to see if it is prefixed by the set of SURT prefixes
066: */
067: public static final String ATTR_ALSO_CHECK_VIA = "also-check-via";
068: public static final Boolean DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
069:
070: SurtPrefixSet surtPrefixes = null;
071:
072: public SurtPrefixScope(String name) {
073: super (name);
074: setDescription("SurtPrefixScope: A scope for crawls limited to regions of "
075: + "the web defined by a set of SURT prefixes *Deprecated* "
076: + "Use DecidingScope instead. (The SURT form of "
077: + "a URI has its hostname reordered to ease sorting and "
078: + "grouping by domain hierarchies.)");
079: addElementToDefinition(new SimpleType(
080: ATTR_SURTS_SOURCE_FILE,
081: "Source file from which to infer SURT prefixes. Any URLs "
082: + "in file will be converted to the implied SURT prefix, and "
083: + "literal SURT prefixes may be listed on lines beginning "
084: + "with a '+' character.", ""));
085: addElementToDefinition(new SimpleType(
086: ATTR_SEEDS_AS_SURT_PREFIXES,
087: "Should seeds also be interpreted as SURT prefixes.",
088: DEFAULT_SEEDS_AS_SURT_PREFIXES));
089:
090: Type t = addElementToDefinition(new SimpleType(
091: ATTR_SURTS_DUMP_FILE,
092: "Dump file to save SURT prefixes actually used.", ""));
093: t.setExpertSetting(true);
094: t = addElementToDefinition(new SimpleType(
095: ATTR_ALSO_CHECK_VIA,
096: "Whether to also rule URI in-scope if a "
097: + "URI's 'via' URI (the URI from which it was discovered) "
098: + "in SURT form begins with any of the established prefixes. "
099: + "For example, can be used to accept URIs that are 'one hop "
100: + "off' URIs fitting the SURT prefixes. Default is false.",
101: DEFAULT_ALSO_CHECK_VIA));
102: t.setOverrideable(false);
103: t.setExpertSetting(true);
104:
105: }
106:
107: /* (non-Javadoc)
108: * @see org.archive.crawler.framework.CrawlScope#initialize(org.archive.crawler.framework.CrawlController)
109: */
110: public void initialize(CrawlController controller) {
111: super .initialize(controller);
112: readPrefixes();
113: }
114:
115: /**
116: * Check if a URI is part of this scope.
117: *
118: * @param object
119: * An instance of UURI or of CandidateURI.
120: * @return True if focus filter accepts passed object.
121: */
122: protected synchronized boolean focusAccepts(Object object) {
123: // TODO: eliminate duplication wrt/SurtPrefixedDecideRule.evaluate
124: if (surtPrefixes == null) {
125: readPrefixes();
126: }
127: if ((object instanceof CandidateURI)
128: && ((Boolean) getUncheckedAttribute(null,
129: ATTR_ALSO_CHECK_VIA)).booleanValue()) {
130: if (focusAccepts(((CandidateURI) object).getVia())) {
131: return true;
132: }
133: }
134: String candidateSurt = SurtPrefixSet.getCandidateSurt(object);
135: if (candidateSurt == null) {
136: return false;
137: }
138: return surtPrefixes.containsPrefixOf(candidateSurt);
139: }
140:
141: private void readPrefixes() {
142: surtPrefixes = new SurtPrefixSet();
143: FileReader fr = null;
144:
145: // read SURTs from file, if appropriate
146: String sourcePath = (String) getUncheckedAttribute(null,
147: ATTR_SURTS_SOURCE_FILE);
148: if (sourcePath.length() > 0) {
149: File source = new File(sourcePath);
150: if (!source.isAbsolute()) {
151: source = new File(getSettingsHandler().getOrder()
152: .getController().getDisk(), sourcePath);
153: }
154: try {
155: fr = new FileReader(source);
156: try {
157: surtPrefixes.importFromMixed(fr, true);
158: } finally {
159: fr.close();
160: }
161:
162: } catch (IOException e) {
163: e.printStackTrace();
164: throw new RuntimeException(e);
165: }
166: }
167:
168: // interpret seeds as surts, if appropriate
169: boolean deduceFromSeeds = ((Boolean) getUncheckedAttribute(
170: null, ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
171: try {
172: fr = new FileReader(getSeedfile());
173: try {
174: surtPrefixes.importFromMixed(fr, deduceFromSeeds);
175: } finally {
176: fr.close();
177: }
178: } catch (IOException e) {
179: e.printStackTrace();
180: throw new RuntimeException(e);
181: }
182:
183: // dump surts to file, if appropriate
184: String dumpPath = (String) getUncheckedAttribute(null,
185: ATTR_SURTS_DUMP_FILE);
186: if (dumpPath.length() > 0) {
187: File dump = new File(dumpPath);
188: if (!dump.isAbsolute()) {
189: dump = new File(getSettingsHandler().getOrder()
190: .getController().getDisk(), dumpPath);
191: }
192: try {
193: FileWriter fw = new FileWriter(dump);
194: try {
195: surtPrefixes.exportTo(fw);
196: } finally {
197: fw.close();
198: }
199: } catch (IOException e) {
200: e.printStackTrace();
201: throw new RuntimeException(e);
202: }
203: }
204: }
205:
206: /**
207: * Re-read prefixes after an update.
208: *
209: * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
210: */
211: public synchronized void kickUpdate() {
212: super .kickUpdate();
213: // TODO: make conditional on file having actually changed,
214: // perhaps by remembering mod-time
215: readPrefixes();
216: }
217: }
|