001: /* BaseRule
002: *
003: * Created on Oct 5, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.url.canonicalize;
024:
025: import java.util.logging.Logger;
026: import java.util.regex.Matcher;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.archive.crawler.settings.ModuleType;
031: import org.archive.crawler.settings.SimpleType;
032: import org.archive.crawler.url.CanonicalizationRule;
033:
034: /**
035: * Base of all rules applied canonicalizing a URL that are configurable
036: * via the Heritrix settings system.
037: *
038: * This base class is abstract. Subclasses must implement the
039: * {@link CanonicalizationRule#canonicalize(String, Object)} method.
040: *
041: * @author stack
042: * @version $Date: 2005-11-04 23:00:23 +0000 (Fri, 04 Nov 2005) $, $Revision: 3932 $
043: */
044: public abstract class BaseRule extends ModuleType implements
045: CanonicalizationRule {
046: private static Logger logger = Logger.getLogger(BaseRule.class
047: .getName());
048: public static final String ATTR_ENABLED = "enabled";
049:
050: /**
051: * Constructor.
052: * @param name Name of this canonicalization rule.
053: * @param description Description of what this rule does.
054: */
055: public BaseRule(String name, String description) {
056: super (name, description);
057: setExpertSetting(true);
058: setOverrideable(true);
059: Object[] possibleValues = { Boolean.TRUE, Boolean.FALSE };
060: addElementToDefinition(new SimpleType(ATTR_ENABLED,
061: "Rule is enabled.", new Boolean(true), possibleValues));
062: }
063:
064: public boolean isEnabled(Object context) {
065: boolean result = true;
066: try {
067: Boolean b = (Boolean) getAttribute(context, ATTR_ENABLED);
068: if (b != null) {
069: result = b.booleanValue();
070: }
071: } catch (AttributeNotFoundException e) {
072: logger.warning("Failed get of 'enabled' attribute.");
073: }
074:
075: return result;
076: }
077:
078: /**
079: * Run a regex that strips elements of a string.
080: *
081: * Assumes the regex has a form that wants to strip elements of the passed
082: * string. Assumes that if a match, appending group 1
083: * and group 2 yields desired result.
084: * @param url Url to search in.
085: * @param matcher Matcher whose form yields a group 1 and group 2 if a
086: * match (non-null.
087: * @return Original <code>url</code> else concatenization of group 1
088: * and group 2.
089: */
090: protected String doStripRegexMatch(String url, Matcher matcher) {
091: return (matcher != null && matcher.matches()) ? checkForNull(matcher
092: .group(1))
093: + checkForNull(matcher.group(2))
094: : url;
095: }
096:
097: /**
098: * @param string String to check.
099: * @return <code>string</code> if non-null, else empty string ("").
100: */
101: private String checkForNull(String string) {
102: return (string != null) ? string : "";
103: }
104: }
|