001: /* SupplementaryLinksScoper
002: *
003: * $Id: SupplementaryLinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $
004: *
005: * Created on Oct 2, 2003
006: *
007: * Copyright (C) 2003 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: *
025: */
026: package org.archive.crawler.postprocessor;
027:
028: import java.util.Collection;
029: import java.util.HashSet;
030: import java.util.logging.Level;
031: import java.util.logging.Logger;
032:
033: import javax.management.AttributeNotFoundException;
034:
035: import org.archive.crawler.datamodel.CandidateURI;
036: import org.archive.crawler.datamodel.CrawlURI;
037: import org.archive.crawler.deciderules.DecideRule;
038: import org.archive.crawler.deciderules.DecideRuleSequence;
039: import org.archive.crawler.framework.Filter;
040: import org.archive.crawler.framework.Scoper;
041: import org.archive.crawler.settings.MapType;
042:
043: /**
044: * Run CandidateURI links carried in the passed CrawlURI through a filter
045: * and 'handle' rejections.
046: * Used to do supplementary processing of links after they've been scope
047: * processed and ruled 'in-scope' by LinkScoper. An example of
048: * 'supplementary processing' would check that a Link is intended for
* this host to crawl in a multimachine crawl setting. Configure
* DecideRules to rule on links. The default rejection handler logs the
* rejected URL (at INFO level). Subclass
* to handle rejected URLs otherwise.
052: * @author stack
053: */
054: public class SupplementaryLinksScoper extends Scoper {
055:
056: private static final long serialVersionUID = -775819977752790418L;
057:
058: private static Logger LOGGER = Logger
059: .getLogger(SupplementaryLinksScoper.class.getName());
060:
061: public static final String ATTR_LINKS_DECIDE_RULES = "link-rules";
062:
063: /**
064: * @param name Name of this filter.
065: */
066: public SupplementaryLinksScoper(String name) {
067: super (
068: name,
069: "SupplementaryLinksScoper. Use to do supplementary "
070: + "processing of in-scope links. Will run each link through "
071: + "configured filters. Must be run after LinkScoper and "
072: + "before FrontierScheduler. "
073: + "Optionally logs rejected links (Enable "
074: + ATTR_OVERRIDE_LOGGER_ENABLED
075: + " and set logger level "
076: + "at INFO or above).");
077:
078: addElementToDefinition(new DecideRuleSequence(
079: ATTR_LINKS_DECIDE_RULES,
080: "DecideRules which if their final decision on a link is "
081: + "REJECT, cause the link to be ruled out-of-scope, even "
082: + "if it had previously been accepted by the main scope."));
083: }
084:
085: protected void innerProcess(final CrawlURI curi) {
086: // If prerequisites or no links, nothing to be done in here.
087: if (curi.hasPrerequisiteUri() || curi.outlinksSize() <= 0) {
088: return;
089: }
090:
091: Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
092: for (CandidateURI cauri : curi.getOutCandidates()) {
093: if (isInScope(cauri)) {
094: inScopeLinks.add(cauri);
095: }
096: }
097: // Replace current links collection w/ inscopeLinks. May be
098: // an empty collection.
099: curi.replaceOutlinks(inScopeLinks);
100: }
101:
102: protected boolean isInScope(CandidateURI caUri) {
103: // TODO: Fix filters so work on CandidateURI.
104: CrawlURI curi = (caUri instanceof CrawlURI) ? (CrawlURI) caUri
105: : new CrawlURI(caUri.getUURI());
106: boolean result = false;
107: if (rulesAccept(getLinkRules(curi), curi)) {
108: result = true;
109: if (LOGGER.isLoggable(Level.FINER)) {
110: LOGGER.finer("Accepted: " + caUri);
111: }
112: } else {
113: outOfScope(caUri);
114: }
115: return result;
116: }
117:
118: protected DecideRule getLinkRules(Object o) {
119: try {
120: return (DecideRule) getAttribute(o, ATTR_LINKS_DECIDE_RULES);
121: } catch (AttributeNotFoundException e) {
122: throw new RuntimeException(e);
123: }
124: }
125:
126: /**
127: * Called when a CandidateUri is ruled out of scope.
128: * @param caUri CandidateURI that is out of scope.
129: */
130: protected void outOfScope(CandidateURI caUri) {
131: if (!LOGGER.isLoggable(Level.INFO)) {
132: return;
133: }
134: LOGGER.info(caUri.getUURI().toString());
135: }
136: }
|