001: /*
002: * MatchesFilePatternDecideRule
003: *
004: * $Id: MatchesFilePatternDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
005: *
006: * Created on Mar 11, 2004
007: *
008: * Copyright (C) 2004 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.crawler.deciderules;
028:
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.SimpleType;
034:
035: /**
036: * Compares suffix of a passed CrawlURI, UURI, or String against a regular
037: * expression pattern, applying its configured decision to all matches.
038: *
039: * Several predefined patterns are available for convenience. Choosing
040: * 'custom' makes this the same as a regular MatchesRegExpDecideRule.
041: *
042: * @author Igor Ranitovic
043: */
044: public class MatchesFilePatternDecideRule extends
045: MatchesRegExpDecideRule {
046:
047: private static final long serialVersionUID = -4182743018517062411L;
048:
049: private static final Logger logger = Logger
050: .getLogger(MatchesFilePatternDecideRule.class.getName());
051: public static final String ATTR_USE_PRESET = "use-preset-pattern";
052: public static final String IMAGES_PATTERNS = ".*(?i)(\\.(bmp|gif|jpe?g|png|tiff?))$";
053: public static final String AUDIO_PATTERNS = ".*(?i)(\\.(mid|mp2|mp3|mp4|wav))$";
054: public static final String VIDEO_PATTERNS = ".*(?i)(\\.(avi|mov|mpeg|ram|rm|smil|wmv))$";
055: public static final String MISC_PATTERNS = ".*(?i)(\\.(doc|pdf|ppt|swf))$";
056: public static final String ALL_DEFAULT_PATTERNS = ".*(?i)(\\.(bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg"
057: + "|ram|rm|smil|wmv|doc|pdf|ppt|swf))$";
058:
059: public static final String ALL = "All";
060: public static final String IMAGES = "Images";
061: public static final String AUDIO = "Audio";
062: public static final String VIDEO = "Video";
063: public static final String MISC = "Miscellaneous";
064: public static final String CUSTOM = "Custom";
065:
066: /**
067: * Usual constructor.
068: * @param name
069: */
070: public MatchesFilePatternDecideRule(String name) {
071: super (name);
072: setDescription("MatchesFilePatternDecideRule. Applies its decision "
073: + "to all URIs that end with the specified pattern(s). Anything "
074: + " that does not match is let PASS. "
075: + " Default file patterns are: .avi, .bmp, "
076: + ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, "
077: + ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv. "
078: + "It is also possible to specify a custom regular expression, "
079: + "in which case this behaves exactly like the "
080: + " MatchesRegExpDecideRule. See also "
081: + "NotMatchesFilePatternDecideRule.");
082:
083: String[] options = new String[] { ALL, IMAGES, AUDIO, VIDEO,
084: MISC, CUSTOM };
085:
086: addElementToDefinition(new SimpleType(
087: ATTR_USE_PRESET,
088: "URIs that match selected file "
089: + "patterns will have the decision applied. Default file "
090: + "patterns are:\n"
091: + "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\n"
092: + "Audio: .mid, mp2, .mp3, .mp4, .wav\n"
093: + "Video: .avi, .mov, .mpeg, .ram, .rm, .smil, .wmv\n"
094: + "Miscellaneous: .doc, .pdf, .ppt, .swf\n"
095: + "All: All above patterns\n"
096: + "Choose 'Custom' to specify your own pattern. Preset "
097: + "patterns are case insensitive.", "All",
098: options));
099:
100: addElementToDefinition(new SimpleType(
101: ATTR_REGEXP,
102: "Custom java regular expression. "
103: + "This regular expression will be used instead of the "
104: + "supplied pattern groups for matching. An example "
105: + "of such a regular expression (Miscellaneous): "
106: + ".*(?i)(\\.(doc|pdf|ppt|swf))$ "
107: + "Any arbitrary regular expression may be entered and "
108: + "will be applied to the URI.", ""));
109: }
110:
111: /**
112: * Use a preset if configured to do so.
113: * @param o Context
114: * @return Regex to use.
115: *
116: * @see org.archive.crawler.filter.URIRegExpFilter#getRegexp(Object)
117: */
118: protected String getRegexp(Object o) {
119: try {
120: String patternType = (String) getAttribute(o,
121: ATTR_USE_PRESET);
122: if (patternType.equals(ALL)) {
123: return ALL_DEFAULT_PATTERNS;
124: } else if (patternType.equals(IMAGES)) {
125: return IMAGES_PATTERNS;
126: } else if (patternType.equals(AUDIO)) {
127: return AUDIO_PATTERNS;
128: } else if (patternType.equals(VIDEO)) {
129: return VIDEO_PATTERNS;
130: } else if (patternType.equals(MISC)) {
131: return MISC_PATTERNS;
132: } else if (patternType.equals(CUSTOM)) {
133: return super .getRegexp(o);
134: } else {
135: assert false : "Unrecognized pattern type "
136: + patternType + ". Should never happen!";
137: }
138: } catch (AttributeNotFoundException e) {
139: logger.severe(e.getMessage());
140: }
141: return null; // Basically the rule is inactive if this occurs.
142: }
143: }
|