001: /* SurtPrefixFilter
002: *
003: * $Id: SurtPrefixFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
004: *
005: * Created on Jul 22, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.filter;
026:
027: import java.io.File;
028: import java.io.FileReader;
029: import java.io.IOException;
030:
031: import org.archive.crawler.deciderules.DecideRule;
032: import org.archive.crawler.deciderules.DecidingFilter;
033: import org.archive.crawler.framework.Filter;
034: import org.archive.crawler.settings.SimpleType;
035: import org.archive.util.SURT;
036: import org.archive.util.SurtPrefixSet;
037:
038: /**
039: * A filter which tests a URI against a set of SURT
040: * prefixes, and if the URI's prefix is in the set,
041: * returns the chosen true/false accepts value.
042: *
043: * @author gojomo
044: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
045: * equivalent {@link DecideRule}.
046: */
047: public class SurtPrefixFilter extends Filter {
048:
049: private static final long serialVersionUID = -6933592892325852022L;
050:
051: public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
052: public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
053:
054: SurtPrefixSet surtPrefixes = null;
055:
056: /**
057: * @param name
058: */
059: public SurtPrefixFilter(String name) {
060: super (name, "SURT prefix filter *Deprecated* Use"
061: + "DecidingFilter and equivalent DecideRule instead.");
062: addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
063: "What to return when " + "a prefix matches.\n",
064: new Boolean(true)));
065: addElementToDefinition(new SimpleType(
066: ATTR_SURTS_SOURCE_FILE,
067: "Source file from which to infer SURT prefixes. Any URLs "
068: + "in file will be converted to the implied SURT prefix, and "
069: + "literal SURT prefixes may be listed on lines beginning "
070: + "with a '+' character.", ""));
071: }
072:
073: /* (non-Javadoc)
074: * @see org.archive.crawler.framework.Filter#accepts(java.lang.Object)
075: */
076: protected synchronized boolean innerAccepts(Object o) {
077: if (surtPrefixes == null) {
078: readPrefixes();
079: }
080: String s = SURT.fromURI(o.toString());
081: // also want to treat https as http
082: if (s.startsWith("https:")) {
083: s = "http:" + s.substring(6);
084: }
085: // TODO: consider other cases of scheme-indifference?
086: return surtPrefixes.containsPrefixOf(s);
087: }
088:
089: private void readPrefixes() {
090: surtPrefixes = new SurtPrefixSet();
091: String sourcePath = (String) getUncheckedAttribute(null,
092: ATTR_SURTS_SOURCE_FILE);
093: File source = new File(sourcePath);
094: if (!source.isAbsolute()) {
095: source = new File(getSettingsHandler().getOrder()
096: .getController().getDisk(), sourcePath);
097: }
098: FileReader fr = null;
099: try {
100: fr = new FileReader(source);
101: try {
102: surtPrefixes.importFromMixed(fr, true);
103: } finally {
104: fr.close();
105: }
106: } catch (IOException e) {
107: e.printStackTrace();
108: throw new RuntimeException(e);
109: }
110: }
111:
112: /**
113: * Re-read prefixes after a settings update.
114: *
115: */
116: public synchronized void kickUpdate() {
117: super .kickUpdate();
118: // TODO: make conditional on file having actually changed,
119: // perhaps by remembering mod-time
120: readPrefixes();
121: }
122: }
|