001: /* SurtPrefixedDecideRule
002: *
003: * $Id: SurtPrefixedDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on Apr 5, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
027: import java.io.File;
028: import java.io.FileReader;
029: import java.io.FileWriter;
030: import java.io.IOException;
031:
032: import org.archive.crawler.datamodel.CandidateURI;
033: import org.archive.crawler.framework.CrawlScope;
034: import org.archive.crawler.scope.SeedListener;
035: import org.archive.crawler.settings.SimpleType;
036: import org.archive.crawler.settings.Type;
037: import org.archive.util.SurtPrefixSet;
038:
039: /**
040: * Rule applies configured decision to any URIs that, when
041: * expressed in SURT form, begin with one of the prefixes
042: * in the configured set.
043: *
044: * The set can be filled with SURT prefixes implied or
045: * listed in the seeds file, or another external file.
046: *
047: * The "also-check-via" option to implement "one hop off"
048: * scoping derives from a contribution by Shifra Raffel
049: * of the California Digital Library.
050: *
051: * @author gojomo
052: */
053: public class SurtPrefixedDecideRule extends PredicatedDecideRule
054: implements SeedListener {
055:
056: private static final long serialVersionUID = 2075790126085405015L;
057:
058: //private static final Logger logger =
059: // Logger.getLogger(SurtPrefixedDecideRule.class.getName());
060:
061: public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
062: public static final String ATTR_SEEDS_AS_SURT_PREFIXES = "seeds-as-surt-prefixes";
063: public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
064:
065: private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES = new Boolean(
066: true);
067:
068: /**
069: * Whether every config change should trigger a
070: * rebuilding of the prefix set.
071: */
072: public static final String ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
073: public static final Boolean DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
074:
075: /**
076: * Whether the 'via' of CrawlURIs should also be checked
077: * to see if it is prefixed by the set of SURT prefixes
078: */
079: public static final String ATTR_ALSO_CHECK_VIA = "also-check-via";
080: public static final Boolean DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
081:
082: protected SurtPrefixSet surtPrefixes = null;
083:
084: /**
085: * Usual constructor.
086: * @param name
087: */
088: public SurtPrefixedDecideRule(String name) {
089: super (name);
090: setDescription("SurtPrefixedDecideRule. Makes the configured decision "
091: + "for any URI which, when expressed in SURT form, begins "
092: + "with any of the established prefixes (from either seeds "
093: + "specification or an external file).");
094: addElementToDefinition(new SimpleType(
095: ATTR_SURTS_SOURCE_FILE,
096: "Source file from which to infer SURT prefixes. Any URLs "
097: + "in file will be converted to the implied SURT prefix, and "
098: + "literal SURT prefixes may be listed on lines beginning "
099: + "with a '+' character.", ""));
100: addElementToDefinition(new SimpleType(
101: ATTR_SEEDS_AS_SURT_PREFIXES,
102: "Should seeds also be interpreted as SURT prefixes.",
103: DEFAULT_SEEDS_AS_SURT_PREFIXES));
104: Type t = addElementToDefinition(new SimpleType(
105: ATTR_SURTS_DUMP_FILE,
106: "Dump file to save SURT prefixes actually used: "
107: + "Useful debugging SURTs.", ""));
108: t.setExpertSetting(true);
109: t = addElementToDefinition(new SimpleType(
110: ATTR_ALSO_CHECK_VIA,
111: "Whether to also make the configured decision if a "
112: + "URI's 'via' URI (the URI from which it was discovered) "
113: + "in SURT form begins with any of the established prefixes. "
114: + "For example, can be used to ACCEPT URIs that are 'one hop "
115: + "off' URIs fitting the SURT prefixes. Default is false.",
116: DEFAULT_ALSO_CHECK_VIA));
117: t.setOverrideable(false);
118: t.setExpertSetting(true);
119: t = addElementToDefinition(new SimpleType(
120: ATTR_REBUILD_ON_RECONFIG,
121: "Whether to rebuild the internal structures from source "
122: + "files (including seeds if appropriate) every time any "
123: + "configuration change occurs. If true, "
124: + "rule is rebuilt from sources even when (for example) "
125: + "unrelated new domain overrides are set. Rereading large"
126: + "source files can take a long time.",
127: DEFAULT_REBUILD_ON_RECONFIG));
128: t.setOverrideable(false);
129: t.setExpertSetting(true);
130: }
131:
132: /**
133: * Evaluate whether given object's URI is covered by the SURT prefix set
134: *
135: * @param object Item to evaluate.
136: * @return true if item, as SURT form URI, is prefixed by an item in the set
137: */
138: protected boolean evaluate(Object object) {
139: if ((object instanceof CandidateURI)
140: && ((Boolean) getUncheckedAttribute(null,
141: ATTR_ALSO_CHECK_VIA)).booleanValue()) {
142: if (evaluate(((CandidateURI) object).getVia())) {
143: return true;
144: }
145: }
146: String candidateSurt;
147: candidateSurt = SurtPrefixSet.getCandidateSurt(object);
148: if (candidateSurt == null) {
149: return false;
150: }
151: return getPrefixes().containsPrefixOf(candidateSurt);
152: }
153:
154: /**
155: * Synchronized get of prefix set to use
156: *
157: * @return SurtPrefixSet to use for check
158: */
159: private synchronized SurtPrefixSet getPrefixes() {
160: if (surtPrefixes == null) {
161: readPrefixes();
162: }
163: return surtPrefixes;
164: }
165:
166: protected void readPrefixes() {
167: buildSurtPrefixSet();
168: dumpSurtPrefixSet();
169: }
170:
171: /**
172: * Dump the current prefixes in use to configured dump file (if any)
173: */
174: protected void dumpSurtPrefixSet() {
175: // dump surts to file, if appropriate
176: String dumpPath = (String) getUncheckedAttribute(null,
177: ATTR_SURTS_DUMP_FILE);
178: if (dumpPath.length() > 0) {
179: File dump = new File(dumpPath);
180: if (!dump.isAbsolute()) {
181: dump = new File(getSettingsHandler().getOrder()
182: .getController().getDisk(), dumpPath);
183: }
184: try {
185: FileWriter fw = new FileWriter(dump);
186: try {
187: surtPrefixes.exportTo(fw);
188: } finally {
189: fw.close();
190: }
191: } catch (IOException e) {
192: e.printStackTrace();
193: throw new RuntimeException(e);
194: }
195: }
196: }
197:
198: /**
199: * Construct the set of prefixes to use, from the seed list (
200: * which may include both URIs and '+'-prefixed directives).
201: */
202: protected void buildSurtPrefixSet() {
203: SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
204: FileReader fr = null;
205:
206: // read SURTs from file, if appropriate
207: String sourcePath = (String) getUncheckedAttribute(null,
208: ATTR_SURTS_SOURCE_FILE);
209: if (sourcePath.length() > 0) {
210: File source = new File(sourcePath);
211: if (!source.isAbsolute()) {
212: source = new File(getSettingsHandler().getOrder()
213: .getController().getDisk(), sourcePath);
214: }
215: try {
216: fr = new FileReader(source);
217: try {
218: newSurtPrefixes.importFromMixed(fr, true);
219: } finally {
220: fr.close();
221: }
222: } catch (IOException e) {
223: e.printStackTrace();
224: throw new RuntimeException(e);
225: }
226: }
227:
228: // interpret seeds as surts, if appropriate
229: boolean deduceFromSeeds = ((Boolean) getUncheckedAttribute(
230: null, ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
231: if (deduceFromSeeds) {
232: try {
233: fr = new FileReader(getSeedfile());
234: try {
235: newSurtPrefixes
236: .importFromMixed(fr, deduceFromSeeds);
237: } finally {
238: fr.close();
239: }
240: } catch (IOException e) {
241: e.printStackTrace();
242: throw new RuntimeException(e);
243: }
244: }
245:
246: surtPrefixes = newSurtPrefixes;
247: }
248:
249: /**
250: * Re-read prefixes after an update.
251: *
252: * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
253: */
254: public synchronized void kickUpdate() {
255: super .kickUpdate();
256: if (((Boolean) getUncheckedAttribute(null,
257: ATTR_REBUILD_ON_RECONFIG)).booleanValue()) {
258: readPrefixes();
259: }
260: // TODO: make conditional on file having actually changed,
261: // perhaps by remembering mod-time
262: }
263:
264: /**
265: * Dig through everything to get the crawl-global seeds file.
266: * Add self as listener while at it.
267: *
268: * @return Seed list file
269: */
270: protected File getSeedfile() {
271: CrawlScope scope = getSettingsHandler().getOrder()
272: .getController().getScope();
273: scope.addSeedListener(this );
274: return scope.getSeedfile();
275: }
276:
277: public synchronized void addedSeed(final CandidateURI curi) {
278: SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes
279: .clone();
280: newSurtPrefixes.add(prefixFrom(curi.toString()));
281: surtPrefixes = newSurtPrefixes;
282: }
283:
284: protected String prefixFrom(String uri) {
285: return SurtPrefixSet.prefixFromPlain(uri);
286: }
287: }
|