0001: /*
0002: JSPWiki - a JSP-based WikiWiki clone.
0003:
0004: Copyright (C) 2001-2005 Janne Jalkanen (Janne.Jalkanen@iki.fi)
0005:
0006: This program is free software; you can redistribute it and/or modify
0007: it under the terms of the GNU Lesser General Public License as published by
0008: the Free Software Foundation; either version 2.1 of the License, or
0009: (at your option) any later version.
0010:
0011: This program is distributed in the hope that it will be useful,
0012: but WITHOUT ANY WARRANTY; without even the implied warranty of
0013: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0014: GNU Lesser General Public License for more details.
0015:
0016: You should have received a copy of the GNU Lesser General Public License
0017: along with this program; if not, write to the Free Software
0018: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0019: */
0020: package com.ecyrd.jspwiki.filters;
0021:
0022: import java.io.*;
0023: import java.util.*;
0024:
0025: import javax.servlet.http.HttpServletRequest;
0026: import javax.servlet.http.HttpServletResponse;
0027: import javax.servlet.jsp.PageContext;
0028:
0029: import net.sf.akismet.Akismet;
0030:
0031: import org.apache.commons.jrcs.diff.*;
0032: import org.apache.commons.jrcs.diff.myers.MyersDiff;
0033: import org.apache.commons.lang.time.StopWatch;
0034: import org.apache.log4j.Logger;
0035: import org.apache.oro.text.regex.*;
0036:
0037: import com.ecyrd.jspwiki.*;
0038: import com.ecyrd.jspwiki.attachment.Attachment;
0039: import com.ecyrd.jspwiki.auth.user.UserProfile;
0040: import com.ecyrd.jspwiki.providers.ProviderException;
0041: import com.ecyrd.jspwiki.ui.EditorManager;
0042:
0043: /**
0044: * This is Herb, the JSPWiki spamfilter that can also do choke modifications.
0045: *
0046: * Parameters:
0047: * <ul>
0048: * <li>wordlist - Page name where the regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on
0049: * that page. Default is "SpamFilterWordList".
0050: * <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
0051: * "SpamFilterWordList/blacklist.txt"</li>
0052: * <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".
0053: * <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li>
0054: * <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
0055: * <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
0056: * <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 5)</li>
0057: * <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
0058: * <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
0059: * <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
0060: * <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
0061: * match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
0062: * and calculates a score for the spam, which is then compared to a filter level value.
0063: * </ul>
0064: *
0065: * <p>Changes by admin users are ignored in any case.</p>
0066: *
0067: * @since 2.1.112
0068: * @author Janne Jalkanen
0069: */
0070: public class SpamFilter extends BasicPageFilter {
0071: private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
0072: private static final String REASON_REGEXP = "Regexp";
0073: private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
0074: private static final String REASON_BOT_TRAP = "BotTrap";
0075: private static final String REASON_AKISMET = "Akismet";
0076: private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
0077: private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
0078: private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
0079: private static final String REASON_UTF8_TRAP = "UTF8Trap";
0080:
0081: private static final String LISTVAR = "spamwords";
0082: public static final String PROP_WORDLIST = "wordlist";
0083: public static final String PROP_ERRORPAGE = "errorpage";
0084: public static final String PROP_PAGECHANGES = "pagechangesinminute";
0085: public static final String PROP_SIMILARCHANGES = "similarchanges";
0086: public static final String PROP_BANTIME = "bantime";
0087: public static final String PROP_BLACKLIST = "blacklist";
0088: public static final String PROP_MAXURLS = "maxurls";
0089: public static final String PROP_AKISMET_API_KEY = "akismet-apikey";
0090: public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated";
0091: public static final String PROP_CAPTCHA = "captcha";
0092: public static final String PROP_FILTERSTRATEGY = "strategy";
0093:
0094: public static final String STRATEGY_EAGER = "eager";
0095: public static final String STRATEGY_SCORE = "score";
0096:
0097: private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
0098:
0099: private String m_forbiddenWordsPage = "SpamFilterWordList";
0100: private String m_errorPage = "RejectedMessage";
0101: private String m_blacklist = "SpamFilterWordList/blacklist.txt";
0102:
0103: private PatternMatcher m_matcher = new Perl5Matcher();
0104: private PatternCompiler m_compiler = new Perl5Compiler();
0105:
0106: private Collection m_spamPatterns = null;
0107:
0108: private Date m_lastRebuild = new Date(0L);
0109:
0110: static Logger spamlog = Logger.getLogger("SpamLog");
0111: static Logger log = Logger.getLogger(SpamFilter.class);
0112:
0113: private Vector m_temporaryBanList = new Vector();
0114:
0115: private int m_banTime = 60; // minutes
0116:
0117: private Vector m_lastModifications = new Vector();
0118:
0119: /**
0120: * How many times a single IP address can change a page per minute?
0121: */
0122: private int m_limitSinglePageChanges = 5;
0123:
0124: /**
0125: * How many times can you add the exact same string to a page?
0126: */
0127: private int m_limitSimilarChanges = 2;
0128:
0129: /**
0130: * How many URLs can be added at maximum.
0131: */
0132: private int m_maxUrls = 10;
0133:
0134: private Pattern m_urlPattern;
0135: private Akismet m_akismet;
0136:
0137: private String m_akismetAPIKey = null;
0138:
0139: private boolean m_useCaptcha = false;
0140:
0141: /** The limit at which we consider something to be spam. */
0142: private int m_scoreLimit = 1;
0143:
0144: /**
0145: * If set to true, will ignore anyone who is in Authenticated role.
0146: */
0147: private boolean m_ignoreAuthenticated = false;
0148:
0149: private boolean m_stopAtFirstMatch = true;
0150:
0151: public void initialize(WikiEngine engine, Properties properties) {
0152: m_forbiddenWordsPage = properties.getProperty(PROP_WORDLIST,
0153: m_forbiddenWordsPage);
0154: m_errorPage = properties.getProperty(PROP_ERRORPAGE,
0155: m_errorPage);
0156:
0157: m_limitSinglePageChanges = TextUtil.getIntegerProperty(
0158: properties, PROP_PAGECHANGES, m_limitSinglePageChanges);
0159:
0160: m_limitSimilarChanges = TextUtil.getIntegerProperty(properties,
0161: PROP_SIMILARCHANGES, m_limitSimilarChanges);
0162:
0163: m_maxUrls = TextUtil.getIntegerProperty(properties,
0164: PROP_MAXURLS, m_maxUrls);
0165:
0166: m_banTime = TextUtil.getIntegerProperty(properties,
0167: PROP_BANTIME, m_banTime);
0168:
0169: m_blacklist = properties.getProperty(PROP_BLACKLIST,
0170: m_blacklist);
0171:
0172: m_ignoreAuthenticated = TextUtil.getBooleanProperty(properties,
0173: PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated);
0174:
0175: m_useCaptcha = properties.getProperty(PROP_CAPTCHA, "").equals(
0176: "asirra");
0177:
0178: try {
0179: m_urlPattern = m_compiler.compile(URL_REGEXP);
0180: } catch (MalformedPatternException e) {
0181: log.fatal(
0182: "Internal error: Someone put in a faulty pattern.",
0183: e);
0184: throw new InternalWikiException("Faulty pattern.");
0185: }
0186:
0187: m_akismetAPIKey = TextUtil.getStringProperty(properties,
0188: PROP_AKISMET_API_KEY, m_akismetAPIKey);
0189:
0190: m_stopAtFirstMatch = TextUtil.getStringProperty(properties,
0191: PROP_FILTERSTRATEGY, STRATEGY_EAGER).equals(
0192: STRATEGY_EAGER);
0193:
0194: log.info("# Spam filter initialized. Temporary ban time "
0195: + m_banTime + " mins, max page changes/minute: "
0196: + m_limitSinglePageChanges);
0197:
0198: }
0199:
0200: private static final int REJECT = 0;
0201: private static final int ACCEPT = 1;
0202: private static final int NOTE = 2;
0203:
0204: private static String log(WikiContext ctx, int type, String source,
0205: String message) {
0206: message = TextUtil.replaceString(message, "\r\n", "\\r\\n");
0207: message = TextUtil.replaceString(message, "\"", "\\\"");
0208:
0209: String uid = getUniqueID();
0210:
0211: String page = ctx.getPage().getName();
0212: String reason = "UNKNOWN";
0213: String addr = ctx.getHttpRequest() != null ? ctx
0214: .getHttpRequest().getRemoteAddr() : "-";
0215:
0216: switch (type) {
0217: case REJECT:
0218: reason = "REJECTED";
0219: break;
0220: case ACCEPT:
0221: reason = "ACCEPTED";
0222: break;
0223: case NOTE:
0224: reason = "NOTE";
0225: break;
0226: default:
0227: throw new InternalWikiException("Illegal type " + type);
0228: }
0229:
0230: spamlog.info(reason + " " + source + " " + uid + " " + addr
0231: + " \"" + page + "\" " + message);
0232:
0233: return uid;
0234: }
0235:
0236: public String preSave(WikiContext context, String content)
0237: throws RedirectException {
0238: cleanBanList();
0239: refreshBlacklists(context);
0240:
0241: String change = getChange(context, content);
0242:
0243: if (!ignoreThisUser(context)) {
0244: checkBanList(context, change);
0245: checkSinglePageChange(context, content, change);
0246: checkPatternList(context, content, change);
0247: }
0248:
0249: if (!m_stopAtFirstMatch) {
0250: Integer score = (Integer) context
0251: .getVariable(ATTR_SPAMFILTER_SCORE);
0252:
0253: if (score != null && score.intValue() >= m_scoreLimit) {
0254: throw new RedirectException(
0255: "Herb says you got too many points",
0256: getRedirectPage(context));
0257: }
0258: }
0259:
0260: log(context, ACCEPT, "-", change);
0261: return content;
0262: }
0263:
0264: private void checkStrategy(WikiContext context, String error,
0265: String message) throws RedirectException {
0266: if (m_stopAtFirstMatch) {
0267: throw new RedirectException(message,
0268: getRedirectPage(context));
0269: }
0270:
0271: Integer score = (Integer) context
0272: .getVariable(ATTR_SPAMFILTER_SCORE);
0273:
0274: if (score != null)
0275: score = new Integer(score.intValue() + 1);
0276: else
0277: score = new Integer(1);
0278:
0279: context.setVariable(ATTR_SPAMFILTER_SCORE, score);
0280: }
0281:
0282: /**
0283: * Parses a list of patterns and returns a Collection of compiled Pattern
0284: * objects.
0285: *
0286: * @param source
0287: * @param list
0288: * @return
0289: */
0290: private Collection parseWordList(WikiPage source, String list) {
0291: ArrayList compiledpatterns = new ArrayList();
0292:
0293: if (list != null) {
0294: StringTokenizer tok = new StringTokenizer(list, " \t\n");
0295:
0296: while (tok.hasMoreTokens()) {
0297: String pattern = tok.nextToken();
0298:
0299: try {
0300: compiledpatterns.add(m_compiler.compile(pattern));
0301: } catch (MalformedPatternException e) {
0302: log.debug("Malformed spam filter pattern "
0303: + pattern);
0304:
0305: source.setAttribute("error",
0306: "Malformed spam filter pattern " + pattern);
0307: }
0308: }
0309: }
0310:
0311: return compiledpatterns;
0312: }
0313:
0314: /**
0315: * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled
0316: * Pattern objects.
0317: *
0318: * @param list
0319: * @return
0320: */
0321: private Collection parseBlacklist(String list) {
0322: ArrayList compiledpatterns = new ArrayList();
0323:
0324: if (list != null) {
0325: try {
0326: BufferedReader in = new BufferedReader(
0327: new StringReader(list));
0328:
0329: String line;
0330:
0331: while ((line = in.readLine()) != null) {
0332: line = line.trim();
0333: if (line.length() == 0)
0334: continue; // Empty line
0335: if (line.startsWith("#"))
0336: continue; // It's a comment
0337:
0338: int ws = line.indexOf(' ');
0339:
0340: if (ws == -1)
0341: ws = line.indexOf('\t');
0342:
0343: if (ws != -1)
0344: line = line.substring(0, ws);
0345:
0346: try {
0347: compiledpatterns.add(m_compiler.compile(line));
0348: } catch (MalformedPatternException e) {
0349: log.debug("Malformed spam filter pattern "
0350: + line);
0351: }
0352: }
0353: } catch (IOException e) {
0354: log
0355: .info(
0356: "Could not read patterns; returning what I got",
0357: e);
0358: }
0359: }
0360:
0361: return compiledpatterns;
0362: }
0363:
0364: /**
0365: * Takes a single page change and performs a load of tests on the content change.
0366: * An admin can modify anything.
0367: *
0368: * @param context
0369: * @param content
0370: * @throws RedirectException
0371: */
0372: private synchronized void checkSinglePageChange(
0373: WikiContext context, String content, String change)
0374: throws RedirectException {
0375: HttpServletRequest req = context.getHttpRequest();
0376:
0377: if (req != null) {
0378: String addr = req.getRemoteAddr();
0379: int hostCounter = 0;
0380: int changeCounter = 0;
0381:
0382: log.debug("Change is " + change);
0383:
0384: long time = System.currentTimeMillis() - 60 * 1000L; // 1 minute
0385:
0386: for (Iterator i = m_lastModifications.iterator(); i
0387: .hasNext();) {
0388: Host host = (Host) i.next();
0389:
0390: //
0391: // Check if this item is invalid
0392: //
0393: if (host.getAddedTime() < time) {
0394: log.debug("Removed host " + host.getAddress()
0395: + " from modification queue (expired)");
0396: i.remove();
0397: continue;
0398: }
0399:
0400: //
0401: // Check if this IP address has been seen before
0402: //
0403:
0404: if (host.getAddress().equals(addr)) {
0405: hostCounter++;
0406: }
0407:
0408: //
0409: // Check, if this change has been seen before
0410: //
0411:
0412: if (host.getChange() != null
0413: && host.getChange().equals(change)) {
0414: changeCounter++;
0415: }
0416: }
0417:
0418: //
0419: // Now, let's check against the limits.
0420: //
0421: if (hostCounter >= m_limitSinglePageChanges) {
0422: Host host = new Host(addr, null);
0423:
0424: m_temporaryBanList.add(host);
0425:
0426: String uid = log(context, REJECT,
0427: REASON_TOO_MANY_MODIFICATIONS, change);
0428: log
0429: .info("SPAM:TooManyModifications ("
0430: + uid
0431: + "). Added host "
0432: + addr
0433: + " to temporary ban list for doing too many modifications/minute");
0434: checkStrategy(context, REASON_TOO_MANY_MODIFICATIONS,
0435: "Herb says you look like a spammer, and I trust Herb! (Incident code "
0436: + uid + ")");
0437: }
0438:
0439: if (changeCounter >= m_limitSimilarChanges) {
0440: Host host = new Host(addr, null);
0441:
0442: m_temporaryBanList.add(host);
0443:
0444: String uid = log(context, REJECT,
0445: REASON_SIMILAR_MODIFICATIONS, change);
0446:
0447: log
0448: .info("SPAM:SimilarModifications ("
0449: + uid
0450: + "). Added host "
0451: + addr
0452: + " to temporary ban list for doing too many similar modifications");
0453: checkStrategy(context, REASON_SIMILAR_MODIFICATIONS,
0454: "Herb says you look like a spammer, and I trust Herb! (Incident code "
0455: + uid + ")");
0456: }
0457:
0458: //
0459: // Calculate the number of links in the addition.
0460: //
0461:
0462: String tstChange = change;
0463: int urlCounter = 0;
0464:
0465: while (m_matcher.contains(tstChange, m_urlPattern)) {
0466: MatchResult m = m_matcher.getMatch();
0467:
0468: tstChange = tstChange.substring(m.endOffset(0));
0469:
0470: urlCounter++;
0471: }
0472:
0473: if (urlCounter > m_maxUrls) {
0474: Host host = new Host(addr, null);
0475:
0476: m_temporaryBanList.add(host);
0477:
0478: String uid = log(context, REJECT, REASON_TOO_MANY_URLS,
0479: change);
0480:
0481: log
0482: .info("SPAM:TooManyUrls ("
0483: + uid
0484: + "). Added host "
0485: + addr
0486: + " to temporary ban list for adding too many URLs");
0487: checkStrategy(context, REASON_TOO_MANY_URLS,
0488: "Herb says you look like a spammer, and I trust Herb! (Incident code "
0489: + uid + ")");
0490: }
0491:
0492: //
0493: // Check bot trap
0494: //
0495:
0496: checkBotTrap(context, change);
0497:
0498: //
0499: // Check UTF-8 mangling
0500: //
0501:
0502: checkUTF8(context, change);
0503:
0504: //
0505: // Do Akismet check. This is good to be the last, because this is the most
0506: // expensive operation.
0507: //
0508:
0509: checkAkismet(context, change);
0510:
0511: m_lastModifications.add(new Host(addr, change));
0512: }
0513: }
0514:
0515: /**
0516: * Checks against the akismet system.
0517: *
0518: * @param context
0519: * @param change
0520: * @throws RedirectException
0521: */
0522: private void checkAkismet(WikiContext context, String change)
0523: throws RedirectException {
0524: if (m_akismetAPIKey != null) {
0525: if (m_akismet == null) {
0526: log.info("Initializing Akismet spam protection.");
0527:
0528: m_akismet = new Akismet(m_akismetAPIKey, context
0529: .getEngine().getBaseURL());
0530:
0531: if (!m_akismet.verifyAPIKey()) {
0532: log
0533: .error("Akismet API key cannot be verified. Please check your config.");
0534: m_akismetAPIKey = null;
0535: m_akismet = null;
0536: }
0537: }
0538:
0539: HttpServletRequest req = context.getHttpRequest();
0540:
0541: if (req != null && m_akismet != null) {
0542: log.debug("Calling Akismet to check for spam...");
0543:
0544: StopWatch sw = new StopWatch();
0545: sw.start();
0546:
0547: String ipAddress = req.getRemoteAddr();
0548: String userAgent = req.getHeader("User-Agent");
0549: String referrer = req.getHeader("Referer");
0550: String permalink = context.getViewURL(context.getPage()
0551: .getName());
0552: String commentType = context.getRequestContext()
0553: .equals(WikiContext.COMMENT) ? "comment"
0554: : "edit";
0555: String commentAuthor = context.getCurrentUser()
0556: .getName();
0557: String commentAuthorEmail = null;
0558: String commentAuthorURL = null;
0559:
0560: boolean isSpam = m_akismet.commentCheck(ipAddress,
0561: userAgent, referrer, permalink, commentType,
0562: commentAuthor, commentAuthorEmail,
0563: commentAuthorURL, change, null);
0564:
0565: sw.stop();
0566:
0567: log.debug("Akismet request done in: " + sw);
0568:
0569: if (isSpam) {
0570: // Host host = new Host( ipAddress, null );
0571:
0572: // m_temporaryBanList.add( host );
0573:
0574: String uid = log(context, REJECT, REASON_AKISMET,
0575: change);
0576:
0577: log
0578: .info("SPAM:Akismet ("
0579: + uid
0580: + "). Akismet thinks this change is spam; added host to temporary ban list.");
0581:
0582: checkStrategy(
0583: context,
0584: REASON_AKISMET,
0585: "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code "
0586: + uid + ")");
0587: }
0588: }
0589: }
0590: }
0591:
0592: /**
0593: * Returns a static string which can be used to detect spambots which
0594: * just wildly fill in all the fields.
0595: *
0596: * @return A string
0597: */
0598: public static String getBotFieldName() {
0599: return "submit_auth";
0600: }
0601:
0602: /**
0603: * This checks whether an invisible field is available in the request, and
0604: * whether it's contents are suspected spam.
0605: *
0606: * @param context
0607: * @param change
0608: * @throws RedirectException
0609: */
0610: private void checkBotTrap(WikiContext context, String change)
0611: throws RedirectException {
0612: HttpServletRequest request = context.getHttpRequest();
0613:
0614: if (request != null) {
0615: String unspam = request.getParameter(getBotFieldName());
0616: if (unspam != null && unspam.length() > 0) {
0617: String uid = log(context, REJECT, REASON_BOT_TRAP,
0618: change);
0619:
0620: log.info("SPAM:BotTrap (" + uid
0621: + "). Wildly behaving bot detected.");
0622:
0623: checkStrategy(context, REASON_BOT_TRAP,
0624: "Spamming attempt detected. (Incident code "
0625: + uid + ")");
0626:
0627: }
0628: }
0629: }
0630:
0631: private void checkUTF8(WikiContext context, String change)
0632: throws RedirectException {
0633: HttpServletRequest request = context.getHttpRequest();
0634:
0635: if (request != null) {
0636: String utf8field = request.getParameter("encodingcheck");
0637:
0638: if (utf8field != null && !utf8field.equals("\u3041")) {
0639: String uid = log(context, REJECT, REASON_UTF8_TRAP,
0640: change);
0641:
0642: log.info("SPAM:UTF8Trap (" + uid
0643: + "). Wildly posting dumb bot detected.");
0644:
0645: checkStrategy(context, REASON_UTF8_TRAP,
0646: "Spamming attempt detected. (Incident code "
0647: + uid + ")");
0648: }
0649: }
0650: }
0651:
0652: /**
0653: * Goes through the ban list and cleans away any host which has expired from it.
0654: */
0655: private synchronized void cleanBanList() {
0656: long now = System.currentTimeMillis();
0657:
0658: for (Iterator i = m_temporaryBanList.iterator(); i.hasNext();) {
0659: Host host = (Host) i.next();
0660:
0661: if (host.getReleaseTime() < now) {
0662: log.debug("Removed host " + host.getAddress()
0663: + " from temporary ban list (expired)");
0664: i.remove();
0665: }
0666: }
0667: }
0668:
0669: /**
0670: * Checks the ban list if the IP address of the changer is already on it.
0671: *
0672: * @param context
0673: * @throws RedirectException
0674: */
0675:
0676: private void checkBanList(WikiContext context, String change)
0677: throws RedirectException {
0678: HttpServletRequest req = context.getHttpRequest();
0679:
0680: if (req != null) {
0681: String remote = req.getRemoteAddr();
0682:
0683: long now = System.currentTimeMillis();
0684:
0685: for (Iterator i = m_temporaryBanList.iterator(); i
0686: .hasNext();) {
0687: Host host = (Host) i.next();
0688:
0689: if (host.getAddress().equals(remote)) {
0690: long timeleft = (host.getReleaseTime() - now) / 1000L;
0691:
0692: log(context, REJECT, REASON_IP_BANNED_TEMPORARILY,
0693: change);
0694:
0695: checkStrategy(context,
0696: REASON_IP_BANNED_TEMPORARILY,
0697: "You have been temporarily banned from modifying this wiki. ("
0698: + timeleft
0699: + " seconds of ban left)");
0700: }
0701: }
0702: }
0703:
0704: }
0705:
0706: /**
0707: * If the spam filter notices changes in the black list page, it will refresh
0708: * them automatically.
0709: *
0710: * @param context
0711: */
0712: private void refreshBlacklists(WikiContext context) {
0713: try {
0714: WikiPage source = context.getEngine().getPage(
0715: m_forbiddenWordsPage);
0716: Attachment att = context.getEngine().getAttachmentManager()
0717: .getAttachmentInfo(context, m_blacklist);
0718:
0719: boolean rebuild = false;
0720:
0721: //
0722: // Rebuild, if the page or the attachment has changed since.
0723: //
0724: if (source != null) {
0725: if (m_spamPatterns == null
0726: || m_spamPatterns.isEmpty()
0727: || source.getLastModified()
0728: .after(m_lastRebuild)) {
0729: rebuild = true;
0730: }
0731: }
0732:
0733: if (att != null) {
0734: if (m_spamPatterns == null || m_spamPatterns.isEmpty()
0735: || att.getLastModified().after(m_lastRebuild)) {
0736: rebuild = true;
0737: }
0738: }
0739:
0740: //
0741: // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete
0742: // filter list regardless of what changed.
0743: //
0744:
0745: if (rebuild) {
0746: m_lastRebuild = new Date();
0747:
0748: m_spamPatterns = parseWordList(source,
0749: (source != null) ? (String) source
0750: .getAttribute(LISTVAR) : null);
0751:
0752: log
0753: .info("Spam filter reloaded - recognizing "
0754: + m_spamPatterns.size()
0755: + " patterns from page "
0756: + m_forbiddenWordsPage);
0757:
0758: if (att != null) {
0759: InputStream in = context.getEngine()
0760: .getAttachmentManager()
0761: .getAttachmentStream(att);
0762:
0763: StringWriter out = new StringWriter();
0764:
0765: FileUtil.copyContents(new InputStreamReader(in,
0766: "UTF-8"), out);
0767:
0768: Collection blackList = parseBlacklist(out
0769: .toString());
0770:
0771: log
0772: .info("...recognizing additional "
0773: + blackList.size()
0774: + " patterns from blacklist "
0775: + m_blacklist);
0776:
0777: m_spamPatterns.addAll(blackList);
0778: }
0779: }
0780: } catch (IOException ex) {
0781: log.info("Unable to read attachment data, continuing...",
0782: ex);
0783: } catch (ProviderException ex) {
0784: log
0785: .info(
0786: "Failed to read spam filter attachment, continuing...",
0787: ex);
0788: }
0789:
0790: }
0791:
0792: /**
0793: * Does a check against a known pattern list.
0794: *
0795: * @param context
0796: * @param content
0797: * @param change
0798: * @throws RedirectException
0799: */
0800: private void checkPatternList(WikiContext context, String content,
0801: String change) throws RedirectException {
0802: //
0803: // If we have no spam patterns defined, or we're trying to save
0804: // the page containing the patterns, just return.
0805: //
0806: if (m_spamPatterns == null
0807: || context.getPage().getName().equals(
0808: m_forbiddenWordsPage)) {
0809: return;
0810: }
0811:
0812: if (context.getHttpRequest() != null)
0813: change += context.getHttpRequest().getRemoteAddr();
0814:
0815: for (Iterator i = m_spamPatterns.iterator(); i.hasNext();) {
0816: Pattern p = (Pattern) i.next();
0817:
0818: // log.debug("Attempting to match page contents with "+p.getPattern());
0819:
0820: if (m_matcher.contains(change, p)) {
0821: //
0822: // Spam filter has a match.
0823: //
0824: String uid = log(context, REJECT, REASON_REGEXP + "("
0825: + p.getPattern() + ")", change);
0826:
0827: log.info("SPAM:Regexp (" + uid
0828: + "). Content matches the spam filter '"
0829: + p.getPattern() + "'");
0830:
0831: checkStrategy(
0832: context,
0833: REASON_REGEXP,
0834: "Herb says '"
0835: + p.getPattern()
0836: + "' is a bad spam word and I trust Herb! (Incident code "
0837: + uid + ")");
0838: }
0839: }
0840: }
0841:
0842: /**
0843: * Creates a simple text string describing the added content.
0844: *
0845: * @param context
0846: * @param newText
0847: * @return Empty string, if there is no change.
0848: */
0849: private static String getChange(WikiContext context, String newText) {
0850: WikiPage page = context.getPage();
0851: StringBuffer change = new StringBuffer();
0852: WikiEngine engine = context.getEngine();
0853: // Get current page version
0854:
0855: try {
0856: String oldText = engine.getPureText(page.getName(),
0857: WikiProvider.LATEST_VERSION);
0858:
0859: String[] first = Diff.stringToArray(oldText);
0860: String[] second = Diff.stringToArray(newText);
0861: Revision rev = Diff.diff(first, second, new MyersDiff());
0862:
0863: if (rev == null || rev.size() == 0) {
0864: return "";
0865: }
0866:
0867: for (int i = 0; i < rev.size(); i++) {
0868: Delta d = rev.getDelta(i);
0869:
0870: if (d instanceof AddDelta) {
0871: d.getRevised().toString(change, "", "\r\n");
0872: } else if (d instanceof ChangeDelta) {
0873: d.getRevised().toString(change, "", "\r\n");
0874: }
0875: }
0876: } catch (DifferentiationFailedException e) {
0877: log.error("Diff failed", e);
0878: }
0879:
0880: //
0881: // Don't forget to include the change note, too
0882: //
0883: String changeNote = (String) page
0884: .getAttribute(WikiPage.CHANGENOTE);
0885:
0886: if (changeNote != null) {
0887: change.append("\r\n");
0888: change.append(changeNote);
0889: }
0890:
0891: //
0892: // And author as well
0893: //
0894:
0895: if (page.getAuthor() != null) {
0896: change.append("\r\n" + page.getAuthor());
0897: }
0898:
0899: return change.toString();
0900: }
0901:
0902: /**
0903: * Returns true, if this user should be ignored.
0904: *
0905: * @param context
0906: * @return
0907: */
0908: private boolean ignoreThisUser(WikiContext context) {
0909: if (context.hasAdminPermissions()) {
0910: return true;
0911: }
0912:
0913: if (m_ignoreAuthenticated
0914: && context.getWikiSession().isAuthenticated()) {
0915: return true;
0916: }
0917:
0918: if (context.getVariable("captcha") != null) {
0919: return true;
0920: }
0921:
0922: return false;
0923: }
0924:
0925: /**
0926: * Returns a random string of six uppercase characters.
0927: *
0928: * @return A random string
0929: */
0930: private static String getUniqueID() {
0931: StringBuffer sb = new StringBuffer();
0932: Random rand = new Random();
0933:
0934: for (int i = 0; i < 6; i++) {
0935: char x = (char) ('A' + rand.nextInt(26));
0936:
0937: sb.append(x);
0938: }
0939:
0940: return sb.toString();
0941: }
0942:
0943: /**
0944: * Returns a page to which we shall redirect, based on the current value
0945: * of the "captcha" parameter.
0946: *
0947: * @param ctx WikiContext
0948: * @return An URL to redirect to
0949: */
0950: private String getRedirectPage(WikiContext ctx) {
0951: if (m_useCaptcha)
0952: return ctx.getURL(WikiContext.NONE, "Captcha.jsp", "page="
0953: + ctx.getEngine().encodeName(
0954: ctx.getPage().getName()));
0955:
0956: return ctx.getURL(WikiContext.VIEW, m_errorPage);
0957: }
0958:
0959: /**
0960: * Checks whether the UserProfile matches certain checks.
0961: *
0962: * @param profile
0963: * @return
0964: * @since 2.6.1
0965: */
0966: public boolean isValidUserProfile(WikiContext context,
0967: UserProfile profile) {
0968: try {
0969: checkPatternList(context, profile.getEmail(), profile
0970: .getEmail());
0971: checkPatternList(context, profile.getFullname(), profile
0972: .getFullname());
0973: checkPatternList(context, profile.getLoginName(), profile
0974: .getLoginName());
0975: } catch (RedirectException e) {
0976: log
0977: .info("Detected attempt to create a spammer user account (see above for rejection reason)");
0978: return false;
0979: }
0980:
0981: return true;
0982: }
0983:
0984: /**
0985: * This method is used to calculate an unique code when submitting the page
0986: * to detect edit conflicts. It currently incorporates the last-modified
0987: * date of the page, and the IP address of the submitter.
0988: *
0989: * @param page The WikiPage under edit
0990: * @param request The HTTP Request
0991: * @since 2.6
0992: * @return A hash value for this page and session
0993: */
0994: public static final String getSpamHash(WikiPage page,
0995: HttpServletRequest request) {
0996: long lastModified = 0;
0997:
0998: if (page.getLastModified() != null)
0999: lastModified = page.getLastModified().getTime();
1000:
1001: long remote = request.getRemoteAddr().hashCode();
1002:
1003: return Long.toString(lastModified ^ remote);
1004: }
1005:
1006: /**
1007: * Returns the name of the hash field to be used in this request.
1008: * The value is unique per session, and once the session has expired,
1009: * you cannot edit anymore.
1010: *
1011: * @param request The page request
1012: * @return The name to be used in the hash field
1013: * @since 2.6
1014: */
1015:
1016: private static String c_hashName;
1017: private static long c_lastUpdate;
1018:
1019: /** The HASH_DELAY value is a maximum amount of time that an user can keep
1020: * a session open, because after the value has expired, we will invent a new
1021: * hash field name. By default this is {@value} hours, which should be ample
1022: * time for someone.
1023: */
1024: private static final long HASH_DELAY = 24;
1025:
1026: public static final String getHashFieldName(
1027: HttpServletRequest request) {
1028: String hash = null;
1029:
1030: if (request.getSession() != null) {
1031: hash = (String) request.getSession().getAttribute("_hash");
1032:
1033: if (hash == null) {
1034: hash = c_hashName;
1035:
1036: request.getSession().setAttribute("_hash", hash);
1037: }
1038: }
1039:
1040: if (c_hashName == null
1041: || c_lastUpdate < (System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000)) {
1042: c_hashName = getUniqueID().toLowerCase();
1043:
1044: c_lastUpdate = System.currentTimeMillis();
1045: }
1046:
1047: return hash != null ? hash : c_hashName;
1048: }
1049:
1050: /**
1051: * This method checks if the hash value is still valid, i.e. if it exists at all. This
1052: * can occur in two cases: either this is a spam bot which is not adaptive, or it is
1053: * someone who has been editing one page for too long, and their session has expired.
1054: * <p>
1055: * This method puts a redirect to the http response field to page "SessionExpired"
1056: * and logs the incident in the spam log (it may or may not be spam, but it's rather likely
1057: * that it is).
1058: *
1059: * @param context
1060: * @param pageContext
1061: * @return True, if hash is okay. False, if hash is not okay, and you need to redirect.
1062: * @throws IOException If redirection fails
1063: * @since 2.6
1064: */
1065: public static final boolean checkHash(WikiContext context,
1066: PageContext pageContext) throws IOException {
1067: String hashName = getHashFieldName((HttpServletRequest) pageContext
1068: .getRequest());
1069:
1070: if (pageContext.getRequest().getParameter(hashName) == null) {
1071: if (pageContext.getAttribute(hashName) == null) {
1072: String change = getChange(context, EditorManager
1073: .getEditedText(pageContext));
1074:
1075: log(context, REJECT, "MissingHash", change);
1076:
1077: String redirect = context.getURL(WikiContext.VIEW,
1078: "SessionExpired");
1079: ((HttpServletResponse) pageContext.getResponse())
1080: .sendRedirect(redirect);
1081:
1082: return false;
1083: }
1084: }
1085:
1086: return true;
1087: }
1088:
1089: public static final String insertInputFields(PageContext pageContext) {
1090: WikiContext ctx = WikiContext.findContext(pageContext);
1091: WikiEngine engine = ctx.getEngine();
1092:
1093: StringBuffer sb = new StringBuffer();
1094: if (engine.getContentEncoding().equals("UTF-8")) {
1095: sb
1096: .append("<input name='encodingcheck' type='hidden' value='\u3041' />\n");
1097: }
1098:
1099: return sb.toString();
1100: }
1101:
1102: /**
1103: * A local class for storing host information.
1104: *
1105: * @author jalkanen
1106: *
1107: * @since
1108: */
1109: private class Host {
1110: private long m_addedTime = System.currentTimeMillis();
1111: private long m_releaseTime;
1112: private String m_address;
1113: private String m_change;
1114:
1115: public String getAddress() {
1116: return m_address;
1117: }
1118:
1119: public long getReleaseTime() {
1120: return m_releaseTime;
1121: }
1122:
1123: public long getAddedTime() {
1124: return m_addedTime;
1125: }
1126:
1127: public String getChange() {
1128: return m_change;
1129: }
1130:
1131: public Host(String ipaddress, String change) {
1132: m_address = ipaddress;
1133: m_change = change;
1134:
1135: m_releaseTime = System.currentTimeMillis() + m_banTime * 60
1136: * 1000L;
1137: }
1138: }
1139: }
|