001: package net.matuschek.spider;
002:
003: /************************************************
004: Copyright (c) 2001/2002 by Daniel Matuschek
005: *************************************************/
006:
007: import java.io.BufferedReader;
008: import java.io.IOException;
009: import java.io.Reader;
010: import java.net.URL;
011: import java.util.StringTokenizer;
012: import java.util.Vector;
013:
014: import org.apache.regexp.RESyntaxException;
015:
016: /**
017: * This URLChecker checks a URL using a list of regular expressions
018: * that should be allowed or denied.
019: *
020: * @author Daniel Matuschek
021: * @version $Revision: 1.4 $
022: */
023: public class RegExpURLCheck implements URLCheck {
024: /** vector to store the rules */
025: private Vector<RegExpRule> rules = null;
026:
027: /** default check result if no matching regexp was found */
028: private boolean defaultResult = true;
029:
030: /** initializes the object with an empty rule set */
031: public RegExpURLCheck() {
032: rules = new Vector<RegExpRule>();
033: }
034:
035: /**
036: * <p>initialized the object with a rule set from an
037: * input stream (e.g. a file)</p>
038: *
039: * <p>every line of this stream has the format
040: * <code>allow|deny expression</code></p>
041: *
042: * <p>default value can be set with
043: * <code>allow|deny .</code> at the end of the file</p>
044: *
045: * <p>lines that start with "#" and empty lines will be
046: * ignored</p>
047: */
048: public RegExpURLCheck(Reader r) throws IOException,
049: org.apache.regexp.RESyntaxException {
050: this ();
051:
052: BufferedReader reader = new BufferedReader(r);
053:
054: String line = "";
055: int lineno = 0;
056:
057: while (line != null) {
058: line = reader.readLine();
059: lineno++;
060:
061: if ((line != null) && (!line.trim().equals(""))
062: && (!line.startsWith("#"))) {
063: StringTokenizer st = new StringTokenizer(line);
064: // did we get 2 tokens ?
065: if (st.countTokens() != 2) {
066: throw new IOException("line " + lineno
067: + " don't consists of 2 fields");
068: }
069:
070: String allowStr = st.nextToken();
071: boolean allow = true;
072: String expression = st.nextToken();
073:
074: // allow or deny ?
075: if (allowStr.equalsIgnoreCase("allow")) {
076: allow = true;
077: } else if (allowStr.equalsIgnoreCase("deny")) {
078: allow = false;
079: } else {
080: throw new IOException("first token in line "
081: + lineno + " has to be allow or deny");
082: }
083:
084: addRule(expression, allow);
085: }
086: }
087: }
088:
089: /**
090: * Sets the default result that will be returned if no matching
091: * regular expression was found
092: * @param default the default result
093: */
094: public void setDefaultResult(boolean defaultResult) {
095: this .defaultResult = defaultResult;
096: }
097:
098: /**
099: * Gets the default result that will be returned if no matching
100: * regular expression was found
101: * @return the default result
102: */
103: public boolean getDefaultResult() {
104: return defaultResult;
105: }
106:
107: /**
108: * Gets the list of rules
109: * @return a vector of RegExpRule objects
110: */
111: public Vector getRules() {
112: return rules;
113: }
114:
115: /**
116: * Sets the list of rules
117: * @param rules a vector of RegExpRule objects
118: */
119: public void setRules(Vector<RegExpRule> rules) {
120: this .rules = rules;
121: }
122:
123: /**
124: * adds a allow or deny rule
125: * @param regExp a String containing the regular expression
126: * @param allow allow (TRUE) or deny (FALSE)
127: */
128: public void addRule(String regExp, boolean allow)
129: throws RESyntaxException {
130: RegExpRule rule = new RegExpRule();
131: rule.setPattern(regExp);
132: rule.setAllow(allow);
133: rules.add(rule);
134: }
135:
136: /**
137: * Checks if a given URL is allowed or denied by the rules
138: *
139: * @return true if a matching "allow" rule was found,
140: * false if a matching "deny" rule was found,
141: * the default value if no rule was found
142: * @see #setDefaultResult(boolean)
143: */
144: public boolean checkURL(URL u) {
145: String urlStr = u.toString();
146:
147: for (int i = 0; i < rules.size(); i++) {
148: RegExpRule rule = rules.elementAt(i);
149:
150: if (rule.match(urlStr)) {
151: return rule.getAllow();
152: }
153: }
154:
155: return defaultResult;
156: }
157:
158: /**
159: * Checks if a given URL is allowed or denied by the rules for processing
160: *
161: * @return true if a matching "allow" rule was found,
162: * false if a matching "deny" rule was found,
163: * the default value if no rule was found
164: * @see #setDefaultResult(boolean)
165: */
166: public boolean checkURLForProcessing(URL u) {
167: String urlStr = u.toString();
168:
169: for (int i = 0; i < rules.size(); i++) {
170: RegExpRule rule = rules.elementAt(i);
171:
172: if (rule.match(urlStr)) {
173: return rule.getProcessAllowed();
174: }
175: }
176:
177: return defaultResult;
178: }
179:
180: } // RegExpURLCheck
|