001: /*
002: * FilePatternFilter
003: *
004: * $Id: FilePatternFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
005: *
006: * Created on Mar 11, 2004
007: *
008: * Copyright (C) 2004 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.crawler.filter;
028:
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031:
032: import javax.management.AttributeNotFoundException;
033:
034: import org.archive.crawler.datamodel.CrawlURI;
035: import org.archive.crawler.settings.ComplexType;
036: import org.archive.crawler.settings.MapType;
037: import org.archive.crawler.settings.SimpleType;
038:
039: /**
040: * Compares suffix of a passed CrawlURI, UURI, or String against a regular
041: * expression pattern accepting matches.
042: *
043: * @author Igor Ranitovic
044: * @deprecated As of release 1.10.0. Replaced by
045: * {@link MatchesFilePatternDecideRule}.
046: */
047: public class FilePatternFilter extends URIRegExpFilter {
048:
049: private static final long serialVersionUID = -4019256104085004651L;
050:
051: private static final Logger logger = Logger
052: .getLogger(FilePatternFilter.class.getName());
053: public static final String ATTR_USE_DEFAULT = "use-default-patterns";
054: public static final String IMAGES_PATTERNS = ".*(?i)(\\.(bmp|gif|jpe?g"
055: + "|png|tiff?))$";
056: public static final String AUDIO_PATTERNS = ".*(?i)(\\.(mid|mp2|mp3|mp4"
057: + "|wav))$";
058: public static final String VIDEO_PATTERNS = ".*(?i)(\\.(avi|mov|mpeg|ram"
059: + "|rm|smil|wmv))$";
060: public static final String MISC_PATTERNS = ".*(?i)(\\.(doc|pdf|ppt|swf))$";
061: public static final String ALL_DEFAULT_PATTERNS = ".*(?i)(\\.(bmp|gif"
062: + "|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|rm|smil|wmv"
063: + "|doc|pdf|ppt|swf))$";
064:
065: public static final String ALL = "All";
066: public static final String IMAGES = "Images";
067: public static final String AUDIO = "Audio";
068: public static final String VIDEO = "Video";
069: public static final String MISC = "Miscellaneous";
070: public static final String CUSTOM = "Custom";
071:
072: /**
073: * @param name
074: */
075: public FilePatternFilter(String name) {
076: super (name);
077: setDescription("A URI path suffix filter *Deprecated* Use"
078: + "DecidingFilter and MatchesFilePatternDecideRule instead. "
079: + "All URLs that end with the specified pattern(s) will be added "
080: + "to the scope's focus. Default file patterns are:\n.avi, .bmp, "
081: + ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, "
082: + ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv\n"
083: + "It is also possible to specifiy custom regular expressions "
084: + "for this filter, turning it into (effectively) a generic "
085: + "regular expression filter.");
086:
087: String[] options = new String[] { ALL, IMAGES, AUDIO, VIDEO,
088: MISC, CUSTOM };
089:
090: addElementToDefinition(new SimpleType(
091: ATTR_USE_DEFAULT,
092: "URLs that match selected file "
093: + "patterns will be crawled. Default file patterns are:\n"
094: + "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\nAudio: .mid, "
095: + ".mp2, .mp3, .mp4, .wav\nVideo: .avi, .mov, .mpeg, .ram, "
096: + ".rm, .smil, .wmv\nMiscellaneous: .doc, .pdf, .ppt, .swf\n"
097: + "All: All above patterns\nChoose 'Custom' to specify your own"
098: + " pattern. These default patterns are case insensitive.",
099: "All", options));
100:
101: addElementToDefinition(new SimpleType(
102: ATTR_REGEXP,
103: "Custom java regular expression.+n "
104: + "This regular expression will be used instead of the "
105: + "supplied pattern groups for matching.\nAn example "
106: + "of such a regular expression (Miscellaneous):\n"
107: + ".*(?i)(\\.(doc|pdf|ppt|swf))$\n"
108: + "Any arbitrary reg.expr. is valid though and will be "
109: + "applied to the URI.", ""));
110:
111: }
112:
113: /**
114: * @see org.archive.crawler.filter.URIRegExpFilter#getRegexp(java.lang.Object)
115: */
116: protected String getRegexp(Object o) {
117: try {
118: String patternType = (String) getAttribute(o,
119: ATTR_USE_DEFAULT);
120:
121: if (patternType.equals(ALL)) {
122: return ALL_DEFAULT_PATTERNS;
123: } else if (patternType.equals(IMAGES)) {
124: return IMAGES_PATTERNS;
125: } else if (patternType.equals(AUDIO)) {
126: return AUDIO_PATTERNS;
127: } else if (patternType.equals(VIDEO)) {
128: return VIDEO_PATTERNS;
129: } else if (patternType.equals(MISC)) {
130: return MISC_PATTERNS;
131: } else if (patternType.equals(CUSTOM)) {
132: return (String) getAttribute(o, ATTR_REGEXP);
133: } else {
134: assert false : "Unrecognized pattern type "
135: + patternType + ". Should never happened!";
136: }
137:
138: } catch (AttributeNotFoundException e) {
139: logger.log(Level.SEVERE, "necessary setting missing", e);
140: }
141: // Basically the filter is inactive if this occurs (The caller
142: // returns 'false' when regexp is null).
143: return null;
144: }
145:
146: /**
147: * @see org.archive.crawler.framework.Filter#accepts(java.lang.Object)
148: */
149: public boolean accepts(Object o) {
150: CrawlURI curi = (o instanceof CrawlURI) ? (CrawlURI) o : null;
151:
152: // Skip the evaluation if the filter is disabled.
153: // Since this filter is primarily used with seed and focus filters
154: // it has to return false when disabled -- unlike Filter's accepts
155: // method.
156: try {
157: if (!((Boolean) getAttribute(ATTR_ENABLED, curi))
158: .booleanValue()) {
159: return false;
160: }
161: } catch (AttributeNotFoundException e) {
162: logger.severe(e.getMessage());
163: }
164:
165: boolean accept = returnTrueIfMatches(curi) == innerAccepts(o);
166:
167: if (accept && logger.isLoggable(Level.FINEST)) {
168: // Log if filter returns true
169: ComplexType p = this .getParent();
170: if (p instanceof MapType) {
171: p = p.getParent();
172: }
173: String msg = this .toString() + " belonging to "
174: + p.toString() + " accepted " + o.toString();
175: logger.finest(msg);
176: }
177:
178: return accept;
179: }
180:
181: }
|