001: /* Canonicalizer
002: *
003: * Created on Oct 7, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.url;
024:
025: import java.util.Iterator;
026: import java.util.logging.Logger;
027: import java.util.logging.Level;
028:
029: import javax.management.AttributeNotFoundException;
030:
031: import org.archive.crawler.datamodel.CrawlOrder;
032: import org.archive.crawler.settings.MapType;
033: import org.archive.net.UURI;
034:
035: /**
036: * URL canonicalizer.
037: * @author stack
038: * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
039: */
040: public class Canonicalizer {
041: private static Logger logger = Logger.getLogger(Canonicalizer.class
042: .getName());
043:
044: /**
045: * Constructor.
046: * This class can't be constructed.
047: * Shutdown.
048: */
049: private Canonicalizer() {
050: super ();
051: }
052:
053: /**
054: * Convenience method that is passed a settings object instance pulling
055: * from it what it needs to canonicalize.
056: * @param uuri UURI to canonicalize.
057: * @param order A crawlorder instance.
058: * @return Canonicalized string of uuri else uuri if an error.
059: */
060: public static String canonicalize(UURI uuri, CrawlOrder order) {
061: MapType rules = null;
062: String canonical = uuri.toString();
063: try {
064: rules = (MapType) order.getAttribute(uuri,
065: CrawlOrder.ATTR_RULES);
066: canonical = Canonicalizer.canonicalize(uuri, rules
067: .iterator(uuri));
068: } catch (AttributeNotFoundException e) {
069: logger.warning("Failed canonicalization of " + canonical
070: + ": " + e);
071: }
072: return canonical;
073: }
074:
075: /**
076: * Run the passed uuri through the list of rules.
077: * @param uuri Url to canonicalize.
078: * @param rules Iterator of canonicalization rules to apply (Get one
079: * of these on the url-canonicalizer-rules element in order files or
080: * create a list externally). Rules must implement the Rule interface.
081: * @return Canonicalized URL.
082: */
083: public static String canonicalize(UURI uuri, Iterator rules) {
084: String before = uuri.toString();
085: //String beforeRule = null;
086: String canonical = before;
087: for (; rules.hasNext();) {
088: CanonicalizationRule r = (CanonicalizationRule) rules
089: .next();
090: //if (logger.isLoggable(Level.FINER)) {
091: // beforeRule = canonical;
092: //}
093: if (!r.isEnabled(uuri)) {
094: if (logger.isLoggable(Level.FINER)) {
095: logger.finer("Rule " + r.getName()
096: + " is disabled.");
097: }
098: continue;
099: }
100: canonical = r.canonicalize(canonical, uuri);
101: if (logger.isLoggable(Level.FINER)) {
102: logger.finer("Rule " + r.getName() + " " + before
103: + " => " + canonical);
104: }
105: }
106: if (logger.isLoggable(Level.INFO)) {
107: logger.fine(before + " => " + canonical);
108: }
109: return canonical;
110: }
111: }
|