001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. The ASF licenses this file to You
004: * under the Apache License, Version 2.0 (the "License"); you may not
005: * use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License. For additional information regarding
015: * copyright in this work, please see the NOTICE file in the top level
016: * directory of this distribution.
017: */
018: /* Created on Nov 11, 2003 */
019: package org.apache.roller.util;
020:
021: import org.apache.commons.logging.Log;
022: import org.apache.commons.logging.LogFactory;
023: import java.io.BufferedReader;
024: import java.io.FileInputStream;
025: import java.io.IOException;
026: import java.io.InputStream;
027: import java.io.InputStreamReader;
028: import java.io.File;
029: import java.io.FileOutputStream;
030: import java.net.HttpURLConnection;
031: import java.net.URL;
032: import java.text.ParseException;
033: import java.text.SimpleDateFormat;
034: import java.util.ArrayList;
035: import java.util.Date;
036: import java.util.Iterator;
037: import java.util.LinkedList;
038: import java.util.List;
039: import java.util.StringTokenizer;
040: import java.util.regex.Matcher;
041: import java.util.regex.Pattern;
042: import org.apache.roller.config.RollerConfig;
043: import org.apache.commons.lang.StringUtils;
044:
045: /**
046: * Loads MT-Blacklist style blacklist from disk and allows callers to test
047: * strings against the blacklist and (optionally) addition blacklists.
048: * <br />
049: * First looks for blacklist.txt in uploads directory, than in classpath
050: * as /blacklist.txt. Download from web feature disabed.
051: * <br />
052: * Blacklist is formatted one entry per line.
053: * Any line that begins with # is considered to be a comment.
054: * Any line that begins with ( is considered to be a regex expression.
055: * <br />
056: * For more information on the (discontinued) MT-Blacklist service:
057: * http://www.jayallen.org/projects/mt-blacklist.
058: *
059: * @author Lance Lavandowska
060: * @author Allen Gilliland
061: */
062: public class Blacklist {
063:
064: private static Log mLogger = LogFactory.getLog(Blacklist.class);
065:
066: private static Blacklist blacklist;
067: private static final String blacklistFile = "blacklist.txt";
068: private static final String lastUpdateStr = "Last update:";
069:
070: /** We no longer have a blacklist update URL */
071: private static final String blacklistURL = null;
072:
073: private Date lastModified = null;
074: private List blacklistStr = new LinkedList();
075: private List blacklistRegex = new LinkedList();
076:
077: // setup our singleton at class loading time
078: static {
079: mLogger.info("Initializing MT Blacklist");
080: blacklist = new Blacklist();
081: blacklist.loadBlacklistFromFile(null);
082: }
083:
084: /** Hide constructor */
085: private Blacklist() {
086: }
087:
088: /** Singleton factory method. */
089: public static Blacklist getBlacklist() {
090: return blacklist;
091: }
092:
093: /** Updated MT blacklist if necessary. */
094: public static void checkForUpdate() {
095: getBlacklist().update();
096: }
097:
098: /** Non-Static update method. */
099: public void update() {
100: if (this .blacklistURL != null) {
101: boolean blacklist_updated = this .downloadBlacklist();
102: if (blacklist_updated) {
103: this .loadBlacklistFromFile(null);
104: }
105: }
106: }
107:
108: /** Download the MT blacklist from the web to our uploads directory. */
109: private boolean downloadBlacklist() {
110:
111: boolean blacklist_updated = false;
112: try {
113: mLogger.debug("Attempting to download MT blacklist");
114:
115: URL url = new URL(blacklistURL);
116: HttpURLConnection connection = (HttpURLConnection) url
117: .openConnection();
118:
119: // after spending way too much time debugging i've discovered
120: // that the blacklist server is selective based on the User-Agent
121: // header. without this header set i always get a 403 response :(
122: connection.setRequestProperty("User-Agent", "Mozilla/5.0");
123:
124: if (this .lastModified != null) {
125: connection.setRequestProperty("If-Modified-Since",
126: DateUtil.formatRfc822(this .lastModified));
127: }
128:
129: int responseCode = connection.getResponseCode();
130:
131: mLogger.debug("HttpConnection response = " + responseCode);
132:
133: // did the connection return NotModified? If so, no need to parse
134: if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
135: mLogger.debug("MT blacklist site says we are current");
136: return false;
137: }
138:
139: // did the connection return a LastModified header?
140: long lastModifiedLong = connection.getHeaderFieldDate(
141: "Last-Modified", -1);
142:
143: // if the file is newer than our current then we need do update it
144: if (responseCode == HttpURLConnection.HTTP_OK
145: && (this .lastModified == null || this .lastModified
146: .getTime() < lastModifiedLong)) {
147:
148: mLogger.debug("my last modified = "
149: + this .lastModified.getTime());
150: mLogger.debug("MT last modified = " + lastModifiedLong);
151:
152: // save the new blacklist
153: InputStream instream = connection.getInputStream();
154:
155: String uploadDir = RollerConfig
156: .getProperty("uploads.dir");
157: String path = uploadDir + File.separator
158: + blacklistFile;
159: FileOutputStream outstream = new FileOutputStream(path);
160:
161: mLogger
162: .debug("writing updated MT blacklist to "
163: + path);
164:
165: // read from url and write to file
166: byte[] buf = new byte[4096];
167: int length = 0;
168: while ((length = instream.read(buf)) > 0)
169: outstream.write(buf, 0, length);
170:
171: outstream.close();
172: instream.close();
173:
174: blacklist_updated = true;
175:
176: mLogger.debug("MT blacklist download completed.");
177:
178: } else {
179: mLogger
180: .debug("blacklist *NOT* saved, assuming we are current");
181: }
182:
183: } catch (Exception e) {
184: mLogger.error("error downloading blacklist", e);
185: }
186:
187: return blacklist_updated;
188: }
189:
190: /**
191: * Load the MT blacklist from the file system.
192: * We look for a previously downloaded version of the blacklist first and
193: * if it's not found then we load the default blacklist packed with Roller.
194: * Only public for purposes of unit testing.
195: */
196: public void loadBlacklistFromFile(String blacklistFilePath) {
197:
198: InputStream txtStream = null;
199: try {
200: String path = blacklistFilePath;
201: if (path == null) {
202: String uploadDir = RollerConfig
203: .getProperty("uploads.dir");
204: path = uploadDir + File.separator + blacklistFile;
205: }
206: File blacklistFile = new File(path);
207:
208: // check our lastModified date to see if we need to re-read the file
209: if (this .lastModified != null
210: && this .lastModified.getTime() >= blacklistFile
211: .lastModified()) {
212: mLogger
213: .debug("Blacklist is current, no need to load again");
214: return;
215: } else {
216: this .lastModified = new Date(blacklistFile
217: .lastModified());
218: }
219: txtStream = new FileInputStream(blacklistFile);
220: mLogger.info("Loading blacklist from " + path);
221:
222: } catch (Exception e) {
223: // Roller keeps a copy in the webapp just in case
224: txtStream = getClass().getResourceAsStream(
225: "/" + blacklistFile);
226: mLogger.warn("Couldn't find downloaded blacklist, "
227: + "loading from classpath instead");
228: }
229:
230: if (txtStream != null) {
231: readFromStream(txtStream, false);
232: } else {
233: mLogger
234: .error("Couldn't load a blacklist file from anywhere, "
235: + "this means blacklist checking is disabled for now.");
236: }
237: mLogger.info("Number of blacklist string rules: "
238: + blacklistStr.size());
239: mLogger.info("Number of blacklist regex rules: "
240: + blacklistRegex.size());
241: }
242:
243: /**
244: * Read in the InputStream for rules.
245: * @param txtStream
246: */
247: private String readFromStream(InputStream txtStream,
248: boolean saveStream) {
249: String line;
250: StringBuffer buf = new StringBuffer();
251: BufferedReader in = null;
252: try {
253: in = new BufferedReader(new InputStreamReader(txtStream,
254: "UTF-8"));
255: while ((line = in.readLine()) != null) {
256: if (line.startsWith("#")) {
257: readComment(line);
258: } else {
259: readRule(line);
260: }
261:
262: if (saveStream)
263: buf.append(line).append("\n");
264: }
265: } catch (Exception e) {
266: mLogger.error(e);
267: } finally {
268: try {
269: if (in != null)
270: in.close();
271: } catch (IOException e1) {
272: mLogger.error(e1);
273: }
274: }
275: return buf.toString();
276: }
277:
278: private void readRule(String str) {
279: if (StringUtils.isEmpty(str))
280: return; // bad condition
281:
282: String rule = str.trim();
283:
284: if (str.indexOf("#") > 0) // line has a comment
285: {
286: int commentLoc = str.indexOf("#");
287: rule = str.substring(0, commentLoc - 1).trim(); // strip comment
288: }
289:
290: if (rule.indexOf("(") > -1) // regex rule
291: {
292: // pre-compile patterns since they will be frequently used
293: blacklistRegex.add(Pattern.compile(rule));
294: } else if (StringUtils.isNotEmpty(rule)) {
295: blacklistStr.add(rule);
296: }
297: }
298:
299: /** Read comment and try to parse out "Last update" value */
300: private void readComment(String str) {
301: int lastUpdatePos = str.indexOf(lastUpdateStr);
302: if (lastUpdatePos > -1) {
303: str = str.substring(lastUpdatePos + lastUpdateStr.length());
304: str = str.trim();
305: try {
306: SimpleDateFormat sdf = new SimpleDateFormat(
307: "yyyy/MM/dd HH:mm:ss");
308: lastModified = DateUtil.parse(str, sdf);
309: } catch (ParseException e) {
310: mLogger.debug("ParseException reading " + str);
311: }
312: }
313: }
314:
315: /**
316: * Does the String argument match any of the rules in the built-in blacklist?
317: */
318: public boolean isBlacklisted(String str) {
319: return isBlacklisted(str, null, null);
320: }
321:
322: /**
323: * Does the String argument match any of the rules in the built-in blacklist
324: * plus additional blacklists provided by caller?
325: * @param str String to be checked against blacklist
326: * @param moreStringRules Additional string rules to consider
327: * @param moreRegexRules Additional regex rules to consider
328: */
329: public boolean isBlacklisted(String str, List moreStringRules,
330: List moreRegexRules) {
331: if (str == null || StringUtils.isEmpty(str))
332: return false;
333:
334: // First iterate over blacklist, doing indexOf.
335: // Then iterate over blacklistRegex and test.
336: // As soon as there is a hit in either case return true
337:
338: // test plain String.indexOf
339: List stringRules = blacklistStr;
340: if (moreStringRules != null && moreStringRules.size() > 0) {
341: stringRules = new ArrayList();
342: stringRules.addAll(moreStringRules);
343: stringRules.addAll(blacklistStr);
344: }
345: if (testStringRules(str, stringRules))
346: return true;
347:
348: // test regex blacklisted
349: List regexRules = blacklistRegex;
350: if (moreRegexRules != null && moreRegexRules.size() > 0) {
351: regexRules = new ArrayList();
352: regexRules.addAll(moreRegexRules);
353: regexRules.addAll(blacklistRegex);
354: }
355: return testRegExRules(str, regexRules);
356: }
357:
358: /**
359: * Test string only against rules provided by caller, NOT against built-in blacklist.
360: * @param str String to be checked against rules
361: * @param moreStringRules String rules to consider
362: * @param moreRegexRules Regex rules to consider
363: */
364: public static boolean matchesRulesOnly(String str,
365: List stringRules, List regexRules) {
366: if (testStringRules(str, stringRules))
367: return true;
368: return testRegExRules(str, regexRules);
369: }
370:
371: /** Test String against the RegularExpression rules. */
372: private static boolean testRegExRules(String str, List regexRules) {
373: boolean hit = false;
374: Pattern testPattern = null;
375: Iterator iter = regexRules.iterator();
376: while (iter.hasNext()) {
377: testPattern = (Pattern) iter.next();
378:
379: // want to see what it is matching on, but only in debug mode
380: if (mLogger.isDebugEnabled()) {
381: Matcher matcher = testPattern.matcher(str);
382: if (matcher.find()) {
383: mLogger.debug(matcher.group() + " matched by "
384: + testPattern.pattern());
385: return true;
386: }
387: } else {
388: if (testPattern.matcher(str).find()) {
389: return true;
390: }
391: }
392: }
393: return hit;
394: }
395:
396: /** Test the String against the String rules, using simple indexOf. */
397: private static boolean testStringRules(String str, List stringRules) {
398: String test;
399: Iterator iter = stringRules.iterator();
400: boolean hit = false;
401: while (iter.hasNext()) {
402: test = (String) iter.next();
403: if (str.indexOf(test) > -1) {
404: // want to see what it is matching on, but only in debug mode
405: if (mLogger.isDebugEnabled()) {
406: mLogger.debug("matched:" + test + ":");
407: }
408: return true;
409: }
410: }
411: return hit;
412: }
413:
414: /** Utility method to populate lists based a blacklist in string form */
415: public static void populateSpamRules(String blacklist,
416: List stringRules, List regexRules, String addendum) {
417: String weblogWords = blacklist;
418: weblogWords = (weblogWords == null) ? "" : weblogWords;
419: String siteWords = (addendum != null) ? addendum : "";
420: StringTokenizer toker = new StringTokenizer(siteWords
421: + weblogWords, "\n");
422: while (toker.hasMoreTokens()) {
423: String token = toker.nextToken().trim();
424: if (token.startsWith("#"))
425: continue;
426: if (token.startsWith("(")) {
427: regexRules.add(Pattern.compile(token));
428: } else {
429: stringRules.add(token);
430: }
431: }
432: }
433:
434: /** Return pretty list of String and RegEx rules. */
435: public String toString() {
436: StringBuffer buf = new StringBuffer("blacklist ");
437: buf.append(blacklistStr).append("\n");
438: buf.append("Regex blacklist ").append(blacklistRegex);
439: return buf.toString();
440: }
441: }
|