001: // The contents of this file are subject to the Mozilla Public License Version
002: // 1.1
003: //(the "License"); you may not use this file except in compliance with the
004: //License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
005: //
006: //Software distributed under the License is distributed on an "AS IS" basis,
007: //WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
008: //for the specific language governing rights and
009: //limitations under the License.
010: //
011: //The Original Code is "The Columba Project"
012: //
013: //The Initial Developers of the Original Code are Frederik Dietz and Timo
014: // Stich.
015: //Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
016: //
017: //All Rights Reserved.
018: package org.columba.mail.spam;
019:
020: import java.io.File;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.security.NoSuchAlgorithmException;
024: import java.util.ArrayList;
025: import java.util.Enumeration;
026: import java.util.List;
027: import java.util.logging.Logger;
028:
029: import javax.swing.JOptionPane;
030:
031: import org.columba.core.config.DefaultConfigDirectory;
032: import org.columba.core.gui.frame.FrameManager;
033: import org.columba.core.io.CloneStreamMaster;
034: import org.columba.core.logging.Logging;
035: import org.columba.mail.folder.IMailbox;
036: import org.columba.mail.spam.command.CommandHelper;
037: import org.columba.mail.spam.rules.RuleList;
038: import org.columba.ristretto.message.Header;
039: import org.macchiato.DBWrapper;
040: import org.macchiato.Message;
041: import org.macchiato.SpamFilter;
042: import org.macchiato.SpamFilterImpl;
043: import org.macchiato.db.FrequencyDB;
044: import org.macchiato.db.MD5SumHelper;
045: import org.macchiato.db.berkleydb.BerkleyFrequencyDBImpl;
046: import org.macchiato.log.MacchiatoLogger;
047: import org.macchiato.maps.ProbabilityMap;
048:
049: /**
050: * Built-in spam filter using the Macchiato library.
051: * <p>
052: * Note, that its necessary for this filter to train a few hundred messages,
053: * before its starting to work. I'm usually starting with around 1000 messages
054: * while keeping it up-to-date with messages which are scored wrong.
055: * <p>
056: * If training mode is enabled, the spam filter automatically adds messages to
057: * its frequency database.
058: *
059: * @author fdietz
060: */
061: public class MacchiatoPlugin implements ISpamPlugin {
062:
063: /** JDK 1.4+ logging framework logger, used for logging. */
064: private static final Logger LOG = Logger
065: .getLogger("org.columba.core.gui.htmlviewer");
066:
067: /**
068: * Delete messages from DB, if DB size > THRESHOLD
069: */
070: public final static int THRESHOLD = 200000;
071:
072: /**
073: * Delete messages from DB after 7 days, if they don't affect the scoring
074: * process because of low occurences.
075: */
076: public final static int AGE = 7;
077:
078: /**
079: * spam filter in macchiator library doing the actual work
080: */
081: private SpamFilter filter;
082:
083: /**
084: * database of tokens, storing occurences of tokens, etc.
085: */
086: private FrequencyDB db;
087:
088: /**
089: * file to store the token database
090: */
091: private File file;
092:
093: /**
094: * dirty flag for database changes
095: */
096: private boolean hasChanged = false;
097:
098: /**
099: * is cache already loaded?
100: */
101: private boolean alreadyLoaded = false;
102:
103: /**
104: *
105: */
106: public MacchiatoPlugin() {
107: // create directory <config-folder>/mail/spamdb
108: File configDirectory = DefaultConfigDirectory.getInstance()
109: .getCurrentPath();
110: File mailDirectory = new File(configDirectory, "mail");
111: file = new File(mailDirectory, "spamdb");
112: if (!file.exists())
113: file.mkdir();
114: db = new DBWrapper(new BerkleyFrequencyDBImpl(file));
115:
116: filter = new SpamFilterImpl(db);
117:
118: // make Columba logger parent of macchiato logger
119: MacchiatoLogger.setParentLogger(Logger
120: .getLogger("org.columba.mail.spam"));
121:
122: }
123:
124: /**
125: * Score message. Using a threshold of 90% here. Every message with at least
126: * 90% is spam. This value should be increased in the future.
127: *
128: * @see org.columba.mail.spam.ISpamPlugin#scoreMessage(org.columba.mail.folder.IMailbox,
129: * java.lang.Object)
130: */
131: public boolean scoreMessage(IMailbox mailbox, Object uid)
132: throws Exception {
133: // load database from file
134: load();
135:
136: // get inputstream of message body
137: InputStream istream = CommandHelper.getBodyPart(mailbox, uid);
138:
139: // we are using this inpustream multiple times
140: // --> istream will be closed by CloneStreamMaster
141: CloneStreamMaster master = new CloneStreamMaster(istream);
142:
143: // get stream
144: istream = master.getClone();
145:
146: // apply additional handcrafted rules
147: ProbabilityMap map = RuleList.getInstance().getProbabilities(
148: mailbox, uid);
149:
150: float score = filter.scoreMessage(new Message(istream), map);
151:
152: return score >= 0.9f;
153: }
154:
155: /**
156: * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsSpam(org.columba.mail.folder.IMailbox,
157: * java.lang.Object)
158: */
159: public void trainMessageAsSpam(IMailbox mailbox, Object uid)
160: throws Exception {
161: // get inputstream of message body
162: InputStream istream = CommandHelper.getBodyPart(mailbox, uid);
163:
164: // get headers
165: Header h = mailbox.getHeaderFields(uid, Message.HEADERFIELDS);
166:
167: // put headers in list
168: Enumeration e = h.getKeys();
169: List list = new ArrayList();
170:
171: while (e.hasMoreElements()) {
172: String key = (String) e.nextElement();
173: list.add(h.get(key));
174: }
175:
176: // load database from file
177: load();
178:
179: try {
180: CloneStreamMaster master = new CloneStreamMaster(istream);
181: InputStream inputStream = master.getClone();
182:
183: byte[] md5sum = MD5SumHelper.createMD5(inputStream);
184: // close stream
185: inputStream.close();
186:
187: // get new inputstream
188: inputStream = master.getClone();
189:
190: Message message = new Message(inputStream, list, md5sum);
191: // check if this message was already learned
192: // -> only add if this is not the case
193: if (db.MD5SumExists(md5sum)) {
194: // message already exists
195: // --> correct token data
196: filter.correctMessageAsSpam(message);
197: } else {
198: // new message
199: filter.trainMessageAsSpam(message);
200: }
201:
202: // close stream
203: inputStream.close();
204:
205: // set dirty flag
206: hasChanged = true;
207: } catch (IOException e1) {
208: LOG.severe(e1.getMessage());
209: if (Logging.DEBUG)
210: e1.printStackTrace();
211: } catch (NoSuchAlgorithmException nsae) {
212: } // does not occur
213:
214: }
215:
216: /**
217: * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsHam(org.columba.mail.folder.IMailbox,
218: * java.lang.Object)
219: */
220: public void trainMessageAsHam(IMailbox mailbox, Object uid)
221: throws Exception {
222: // get inputstream of message body
223: InputStream istream = CommandHelper.getBodyPart(mailbox, uid);
224:
225: // get headers
226: Header h = mailbox.getHeaderFields(uid, Message.HEADERFIELDS);
227:
228: // put headers in list
229: Enumeration e = h.getKeys();
230: List list = new ArrayList();
231:
232: while (e.hasMoreElements()) {
233: String key = (String) e.nextElement();
234: list.add(h.get(key));
235: }
236:
237: // load database from file
238: load();
239:
240: try {
241: CloneStreamMaster master = new CloneStreamMaster(istream);
242: InputStream inputStream = master.getClone();
243:
244: byte[] md5sum = MD5SumHelper.createMD5(inputStream);
245: // close stream
246: inputStream.close();
247:
248: // get new inputstream
249: inputStream = master.getClone();
250: Message message = new Message(inputStream, list, md5sum);
251:
252: // check if this message was already learned
253: if (db.MD5SumExists(md5sum)) {
254: // message already exists
255:
256: // --> correct token data
257: filter.correctMessageAsHam(message);
258: } else {
259: // new message
260:
261: filter.trainMessageAsHam(message);
262: }
263:
264: // close stream
265: inputStream.close();
266:
267: // set dirty flag
268: hasChanged = true;
269: } catch (IOException e1) {
270: LOG.severe(e1.getMessage());
271: if (Logging.DEBUG)
272: e1.printStackTrace();
273: } catch (NoSuchAlgorithmException nsae) {
274: } // does not occur
275:
276: }
277:
278: /**
279: * @see org.columba.mail.spam.ISpamPlugin#save()
280: */
281: public void save() {
282: try {
283: // only save if changes exist
284: if (alreadyLoaded && hasChanged) {
285: // cleanup DB -> remove old tokens
286: db.cleanupDB(THRESHOLD);
287:
288: // close DB
289: db.close();
290: }
291: } catch (Exception e) {
292: if (Logging.DEBUG) {
293: e.printStackTrace();
294: }
295: // TODO (@author fdietz): i18n
296: int value = JOptionPane.showConfirmDialog(FrameManager
297: .getInstance().getActiveFrame(),
298: "An error occured while saving the spam database.\n"
299: + "Try again?", "Error saving database",
300: JOptionPane.YES_NO_OPTION,
301: JOptionPane.WARNING_MESSAGE);
302: if (value == JOptionPane.YES_OPTION) {
303: save();
304: }
305: }
306:
307: }
308:
309: /**
310: * @see org.columba.mail.spam.ISpamPlugin#load()
311: */
312: public void load() {
313: /*
314: * try { // only load if necessary if (!alreadyLoaded && file.exists()) {
315: * FrequencyIO.load(db, file); }
316: *
317: * alreadyLoaded = true; } catch (IOException e) {
318: * JOptionPane.showMessageDialog(
319: * MainInterface.frameModel.getActiveFrame(), "An error occured while
320: * loading the spam database.\n" + "I will use an empty one.", "Error
321: * loading database", JOptionPane.ERROR_MESSAGE); if
322: * (MainInterface.DEBUG) { e.printStackTrace(); } // fail-case db = new
323: * FrequencyDBImpl();
324: *
325: * alreadyLoaded = true; }
326: */
327: }
328:
329: }
|