001: /****************************************************************
002: * Licensed to the Apache Software Foundation (ASF) under one *
003: * or more contributor license agreements. See the NOTICE file *
004: * distributed with this work for additional information *
005: * regarding copyright ownership. The ASF licenses this file *
006: * to you under the Apache License, Version 2.0 (the *
007: * "License"); you may not use this file except in compliance *
008: * with the License. You may obtain a copy of the License at *
009: * *
010: * http://www.apache.org/licenses/LICENSE-2.0 *
011: * *
012: * Unless required by applicable law or agreed to in writing, *
013: * software distributed under the License is distributed on an *
014: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
015: * KIND, either express or implied. See the License for the *
016: * specific language governing permissions and limitations *
017: * under the License. *
****************************************************************/
package org.apache.james.util;
019:
020: import java.util.Map;
021: import java.util.Set;
022: import java.util.SortedSet;
023: import java.util.TreeSet;
024: import java.util.HashMap;
025: import java.util.HashSet;
026: import java.util.Iterator;
027: import java.util.Collection;
028: import java.util.ArrayList;
029:
030: import java.io.Reader;
031: import java.io.StreamTokenizer;
032: import java.io.StringReader;
033:
034: /**
035: * <P>Determines probability that text contains Spam.</P>
036: *
 * <P>Based upon Paul Graham's <a href="http://www.paulgraham.com/spam.html">A Plan for Spam</a>.
 * Extended to Paul Graham's <a href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.</P>
039: *
040: * <P>Sample method usage:</P>
041: *
042: * <P>Use:
043: * void addHam(Reader)
044: * and
045: * void addSpam(Reader)
046: *
047: * methods to build up the Maps of ham & spam tokens/occurrences.
048: * Both addHam and addSpam assume they're reading one message at a time,
049: * if you feed more than one message per call, be sure to adjust the
050: * appropriate message counter: hamMessageCount or spamMessageCount.
051: *
052: * Then...</P>
053: *
054: * <P>Use:
055: * void buildCorpus()
056: *
057: * to build the final token/probabilities Map.
058: *
059: * Use your own methods for persistent storage of either the individual
060: * ham/spam corpus & message counts, and/or the final corpus.
061: *
062: * Then you can...</P>
063: *
064: * <P>Use:
065: * double computeSpamProbability(Reader)
066: *
067: * to determine the probability that a particular text contains spam.
068: * A returned result of 0.9 or above is an indicator that the text was
069: * spam.</P>
070: *
071: * <P>If you use persistent storage, use:
072: * void setCorpus(Map)
073: *
074: * before calling computeSpamProbability.</P>
075: *
076: * @version CVS $Revision: $ $Date: $
077: * @since 2.3.0
078: */
079:
public class BayesianAnalyzer {

    /**
     * Number of "interesting" tokens to use to compute overall
     * spamminess probability.
     */
    private final static int MAX_INTERESTING_TOKENS = 15;

    /**
     * Minimum probability distance from 0.5 for a token to still be
     * considered "interesting" once the MAX_INTERESTING_TOKENS budget
     * has been exhausted (see computeOverallProbability).
     */
    private final static double INTERESTINGNESS_THRESHOLD = 0.46;

    /**
     * Default token probability to use when a token has not been
     * encountered before.
     */
    private final static double DEFAULT_TOKEN_PROBABILITY = 0.4;

    /**
     * Map of ham tokens and their occurrences.
     *
     * String key
     * Integer value
     */
    private Map hamTokenCounts = new HashMap();

    /**
     * Map of spam tokens and their occurrences.
     *
     * String key
     * Integer value
     */
    private Map spamTokenCounts = new HashMap();

    /**
     * Number of ham messages analyzed.
     */
    private int hamMessageCount = 0;

    /**
     * Number of spam messages analyzed.
     */
    private int spamMessageCount = 0;

    /**
     * Final token/probability corpus.
     *
     * String key
     * Double value
     */
    private Map corpus = new HashMap();

    /**
     * Inner class for managing Token Probability Strengths during the
     * computeSpamProbability phase.
     *
     * By probability <i>strength</i> we mean the absolute distance of a
     * probability from the middle value 0.5.
     *
     * It implements Comparable so that its sorting is automatic.
     *
     * Declared static because it never reads enclosing-analyzer state,
     * avoiding the hidden outer-instance reference of a non-static inner
     * class.
     */
    private static class TokenProbabilityStrength implements Comparable {
        /**
         * Message token.
         */
        String token = null;

        /**
         * Token's computed probability strength.
         */
        double strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);

        /**
         * Force the natural sort order for this object to be high-to-low.
         *
         * @param anotherTokenProbabilityStrength A TokenProbabilityStrength instance to compare
         * this instance with.
         *
         * @return The result of the comparison (before, equal, after).
         */
        public final int compareTo(
                Object anotherTokenProbabilityStrength) {
            TokenProbabilityStrength other = (TokenProbabilityStrength) anotherTokenProbabilityStrength;
            // Double.compare replaces the previous "(int) (diff * 1000000)"
            // trick, which silently treated strengths within 1E-6 of each
            // other as equal.  The reversed operand order yields the
            // required descending (strongest-first) ordering.
            int result = Double.compare(other.strength, strength);
            if (result == 0) {
                // Tie-break on the token text so that distinct tokens of
                // equal strength are not collapsed by the TreeSet.
                return this.token.compareTo(other.token);
            } else {
                return result;
            }
        }

        /**
         * Simple toString () implementation mostly for debugging purposes.
         *
         * @return String representation of this object.
         */
        public String toString() {
            StringBuffer sb = new StringBuffer(30);

            sb.append(token).append("=").append(strength);

            return sb.toString();
        }
    }

    /**
     * Basic class constructor.
     */
    public BayesianAnalyzer() {
    }

    /**
     * Public setter for the hamTokenCounts Map.
     *
     * @param hamTokenCounts The new ham Token counts Map.
     */
    public void setHamTokenCounts(Map hamTokenCounts) {
        this.hamTokenCounts = hamTokenCounts;
    }

    /**
     * Public getter for the hamTokenCounts Map.
     */
    public Map getHamTokenCounts() {
        return this.hamTokenCounts;
    }

    /**
     * Public setter for the spamTokenCounts Map.
     *
     * @param spamTokenCounts The new spam Token counts Map.
     */
    public void setSpamTokenCounts(Map spamTokenCounts) {
        this.spamTokenCounts = spamTokenCounts;
    }

    /**
     * Public getter for the spamTokenCounts Map.
     */
    public Map getSpamTokenCounts() {
        return this.spamTokenCounts;
    }

    /**
     * Public setter for spamMessageCount.
     *
     * @param spamMessageCount The new spam message count.
     */
    public void setSpamMessageCount(int spamMessageCount) {
        this.spamMessageCount = spamMessageCount;
    }

    /**
     * Public getter for spamMessageCount.
     */
    public int getSpamMessageCount() {
        return this.spamMessageCount;
    }

    /**
     * Public setter for hamMessageCount.
     *
     * @param hamMessageCount The new ham message count.
     */
    public void setHamMessageCount(int hamMessageCount) {
        this.hamMessageCount = hamMessageCount;
    }

    /**
     * Public getter for hamMessageCount.
     */
    public int getHamMessageCount() {
        return this.hamMessageCount;
    }

    /**
     * Clears all analysis repositories and counters.
     */
    public void clear() {
        corpus.clear();

        tokenCountsClear();

        hamMessageCount = 0;
        spamMessageCount = 0;
    }

    /**
     * Clears token counters.
     */
    public void tokenCountsClear() {
        hamTokenCounts.clear();
        spamTokenCounts.clear();
    }

    /**
     * Public setter for corpus.
     *
     * @param corpus The new corpus.
     */
    public void setCorpus(Map corpus) {
        this.corpus = corpus;
    }

    /**
     * Public getter for corpus.
     */
    public Map getCorpus() {
        return this.corpus;
    }

    /**
     * Builds the corpus from the existing ham & spam counts.
     */
    public void buildCorpus() {
        //Combine the known ham & spam tokens.
        Set set = new HashSet(hamTokenCounts.size()
                + spamTokenCounts.size());
        set.addAll(hamTokenCounts.keySet());
        set.addAll(spamTokenCounts.keySet());
        Map tempCorpus = new HashMap(set.size());

        //Iterate through all the tokens and compute their new
        //individual probabilities.
        Iterator i = set.iterator();
        while (i.hasNext()) {
            String token = (String) i.next();
            tempCorpus
                    .put(token, new Double(computeProbability(token)));
        }
        // Swap in the freshly built corpus in one reference assignment so
        // concurrent readers never observe a half-built map.
        setCorpus(tempCorpus);
    }

    /**
     * Adds a message to the ham list.
     * @param stream A reader stream on the ham message to analyze
     * @throws IOException If any error occurs
     */
    public void addHam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    }

    /**
     * Adds a message to the spam list.
     * @param stream A reader stream on the spam message to analyze
     * @throws IOException If any error occurs
     */
    public void addSpam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    }

    /**
     * Computes the probability that the stream contains SPAM.
     * @param stream The text to be analyzed for Spamminess.
     * @return A 0.0 - 1.0 probability
     * @throws IOException If any error occurs
     */
    public double computeSpamProbability(Reader stream)
            throws java.io.IOException {
        //Build a set of the tokens in the Stream.
        Set tokens = parse(stream);

        // Get the corpus to use in this run
        // A new corpus may be being built in the meantime
        Map workCorpus = getCorpus();

        //Assign their probabilities from the Corpus (using an additional
        //calculation to determine spamminess).
        SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(
                tokens, workCorpus);

        //Compute and return the overall probability that the
        //stream is SPAM.
        return computeOverallProbability(tokenProbabilityStrengths,
                workCorpus);
    }

    /**
     * Parses a stream into tokens, and updates the target Map
     * with the token/counts.
     *
     * @param stream The message text to tokenize.
     * @param target Map of token (String) to occurrence count (Integer),
     *               updated in place.
     */
    private void addTokenOccurrences(Reader stream, Map target)
            throws java.io.IOException {
        String token;
        String header = "";

        //Update target with the tokens/count encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            // nextToken appends '\n' to a token that terminates a line;
            // strip it here and remember the line boundary so the header
            // prefix can be reset below.
            if (token.length() > 0
                    && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            // Skip empty tokens, all-digit tokens, and tokens that would
            // make the header-prefixed key unreasonably long (>= 90 chars).
            if (token.length() > 0
                    && header.length() + token.length() < 90
                    && !allDigits(token)) {
                // A recognized header name becomes a prefix for the rest
                // of that line's tokens (Graham's "Better Bayesian
                // Filtering" refinement).
                if (token.equals("From:")
                        || token.equals("Return-Path:")
                        || token.equals("Subject:")
                        || token.equals("To:")) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                Integer value = null;

                if (target.containsKey(token)) {
                    value = new Integer(((Integer) target.get(token))
                            .intValue() + 1);
                } else {
                    value = new Integer(1);
                }

                target.put(token, value);
            }

            // The header prefix only applies to the line it appeared on.
            if (endingLine) {
                header = "";
            }
        }
    }

    /**
     * Parses a stream into tokens, and returns a Set of
     * the unique tokens encountered.
     *
     * Tokenization mirrors addTokenOccurrences exactly (same header
     * prefixing and filtering) so that classification sees the same
     * token keys that training produced.
     *
     * @param stream The message text to tokenize.
     * @return Set of unique tokens (String).
     */
    private Set parse(Reader stream) throws java.io.IOException {
        Set tokens = new HashSet();
        String token;
        String header = "";

        //Build a Map of tokens encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0
                    && token.charAt(token.length() - 1) == '\n') {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }

            if (token.length() > 0
                    && header.length() + token.length() < 90
                    && !allDigits(token)) {
                if (token.equals("From:")
                        || token.equals("Return-Path:")
                        || token.equals("Subject:")
                        || token.equals("To:")) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }

                token = header + token;

                tokens.add(token);
            }

            if (endingLine) {
                header = "";
            }
        }

        //Return the unique set of tokens encountered.
        return tokens;
    }

    /**
     * Reads the next token from the stream.
     *
     * A token is a run of letters, digits, and selected punctuation
     * ('-', '$', euro, '!', '\''); an embedded '.' or ',' is kept only
     * between two digits (e.g. "1,000.00").  A token that ends a line
     * carries a trailing '\n' so callers can detect line boundaries.
     *
     * @param reader The stream to read from; must support mark/reset
     *               (used to peek one character past a digit separator).
     * @return The next token, or null at end of stream.
     */
    private String nextToken(Reader reader) throws java.io.IOException {
        StringBuffer token = new StringBuffer();
        int i;
        char ch, ch2;
        boolean previousWasDigit = false;
        boolean tokenCharFound = false;

        // NOTE: a previous version returned null when !reader.ready().
        // Reader.ready() == false does not mean end-of-stream (it merely
        // means a read may block), so that check could truncate input on
        // some Reader implementations.  End-of-stream is detected by
        // read() == -1 below.
        while ((i = reader.read()) != -1) {

            ch = (char) i;

            // Recognized header names (including the ':') are returned as
            // whole tokens immediately so the callers' prefixing works.
            if (ch == ':') {
                String tokenString = token.toString() + ':';
                if (tokenString.equals("From:")
                        || tokenString.equals("Return-Path:")
                        || tokenString.equals("Subject:")
                        || tokenString.equals("To:")) {
                    return tokenString;
                }
            }

            if (Character.isLetter(ch) || ch == '-' || ch == '$'
                    || ch == '\u20AC' // the EURO symbol
                    || ch == '!' || ch == '\'') {
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
            } else if (Character.isDigit(ch)) {
                tokenCharFound = true;
                previousWasDigit = true;
                token.append(ch);
            } else if (previousWasDigit && (ch == '.' || ch == ',')) {
                // Keep '.'/',' only when followed by another digit;
                // otherwise push the lookahead back and end the token.
                reader.mark(1);
                previousWasDigit = false;
                i = reader.read();
                if (i == -1) {
                    break;
                }
                ch2 = (char) i;
                if (Character.isDigit(ch2)) {
                    tokenCharFound = true;
                    previousWasDigit = true;
                    token.append(ch);
                    token.append(ch2);
                } else {
                    reader.reset();
                    break;
                }
            } else if (ch == '\r') {
                // cr found, ignore
            } else if (ch == '\n') {
                // eol found: append it as the line-boundary marker and
                // terminate the token.
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
                break;
            } else if (tokenCharFound) {
                // Any other separator ends a token in progress.
                break;
            }
        }

        if (tokenCharFound) {
            // System.out.println("Token read: " + token);
            return token.toString();
        } else {
            return null;
        }
    }

    /**
     * Compute the probability that "token" is SPAM.
     *
     * Ham occurrences are weighted double (Graham's bias against false
     * positives); rarely-seen tokens (combined weight < 5) fall back to
     * the neutral-ish 0.4.
     *
     * @param token
     * @return The probability that the token occurs within spam.
     */
    private double computeProbability(String token) {
        double hamFactor = 0;
        double spamFactor = 0;

        boolean foundInHam = false;
        boolean foundInSpam = false;

        double minThreshold = 0.01;
        double maxThreshold = 0.99;

        if (hamTokenCounts.containsKey(token)) {
            foundInHam = true;
        }

        if (spamTokenCounts.containsKey(token)) {
            foundInSpam = true;
        }

        if (foundInHam) {
            hamFactor = 2 * ((Integer) hamTokenCounts.get(token))
                    .doubleValue();
            if (!foundInSpam) {
                // Tokens seen only in ham get an extreme low bound.
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }

        if (foundInSpam) {
            spamFactor = ((Integer) spamTokenCounts.get(token))
                    .doubleValue();
            if (!foundInHam) {
                // Tokens seen only in spam get an extreme high bound.
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }

        if ((hamFactor + spamFactor) < 5) {
            //This token hasn't been seen enough.
            return 0.4;
        }

        // NOTE(review): assumes the corresponding message count is > 0
        // whenever the token was found in that corpus; if token-count
        // maps are injected via the setters with a zero message count,
        // this division yields NaN — confirm callers maintain both.
        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);

        return Math.max(minThreshold, Math.min(maxThreshold,
                (spamFreq / (hamFreq + spamFreq))));
    }

    /**
     * Returns a SortedSet of TokenProbabilityStrength built from the
     * Corpus and the tokens passed in the "tokens" Set.
     * The ordering is from the highest strength to the lowest strength.
     *
     * @param tokens
     * @param workCorpus
     * @return SortedSet of TokenProbabilityStrength objects.
     */
    private SortedSet getTokenProbabilityStrengths(Set tokens,
            Map workCorpus) {
        //Convert to a SortedSet of token probability strengths.
        SortedSet tokenProbabilityStrengths = new TreeSet();

        Iterator i = tokens.iterator();
        while (i.hasNext()) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();

            tps.token = (String) i.next();

            if (workCorpus.containsKey(tps.token)) {
                tps.strength = Math.abs(0.5 - ((Double) workCorpus
                        .get(tps.token)).doubleValue());
            } else {
                //This token has never been seen before,
                //we'll give it initially the default probability.
                Double corpusProbability = new Double(
                        DEFAULT_TOKEN_PROBABILITY);
                tps.strength = Math
                        .abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;

                // Try case/punctuation "degenerations" of the unknown
                // token and adopt the strongest probability found.
                Collection degeneratedTokens = buildDegenerated(tps.token);
                Iterator iDegenerated = degeneratedTokens.iterator();
                String tokenDegenerated = null;
                double strengthDegenerated;
                while (iDegenerated.hasNext()) {
                    tokenDegenerated = (String) iDegenerated.next();
                    if (workCorpus.containsKey(tokenDegenerated)) {
                        Double probabilityTemp = (Double) workCorpus
                                .get(tokenDegenerated);
                        strengthDegenerated = Math
                                .abs(0.5 - probabilityTemp
                                        .doubleValue());
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
                if (isTokenDegeneratedFound) {
                    synchronized (workCorpus) {
                        workCorpus.put(tps.token, corpusProbability);
                    }
                }
            }

            tokenProbabilityStrengths.add(tps);
        }

        return tokenProbabilityStrengths;
    }

    /**
     * Builds the list of "degenerated" forms of a token: lower-cased
     * variants, first-letter-only-capitalized variants, and variants
     * with trailing '!' characters stripped, each with and without the
     * header prefix (the part up to ':', if any).
     *
     * @param fullToken The token, possibly header-prefixed.
     * @return Collection of candidate degenerated tokens (String).
     */
    private Collection buildDegenerated(String fullToken) {
        ArrayList tokens = new ArrayList();
        String header;
        String token;

        // look for a header string termination
        int headerEnd = fullToken.indexOf(':');
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }

        int end = token.length();
        do {
            if (!token.substring(0, end).equals(
                    token.substring(0, end).toLowerCase())) {
                tokens.add(header
                        + token.substring(0, end).toLowerCase());
                if (header.length() > 0) {
                    tokens.add(token.substring(0, end).toLowerCase());
                }
            }
            if (end > 1 && token.charAt(0) >= 'A'
                    && token.charAt(0) <= 'Z') {
                tokens.add(header + token.charAt(0)
                        + token.substring(1, end).toLowerCase());
                if (header.length() > 0) {
                    tokens.add(token.charAt(0)
                            + token.substring(1, end).toLowerCase());
                }
            }

            // Only keep shortening while trailing '!' characters remain.
            if (token.charAt(end - 1) != '!') {
                break;
            }

            end--;

            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);

        return tokens;
    }

    /**
     * Compute the spamminess probability of the interesting tokens in
     * the tokenProbabilities SortedSet.
     *
     * Combines up to MAX_INTERESTING_TOKENS strongest tokens (plus any
     * further tokens still above INTERESTINGNESS_THRESHOLD) via the
     * naive-Bayes combination p / (p + np).
     *
     * @param tokenProbabilityStrengths
     * @param workCorpus
     * @return Computed spamminess.
     */
    private double computeOverallProbability(
            SortedSet tokenProbabilityStrengths, Map workCorpus) {
        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator iterator = tokenProbabilityStrengths.iterator();
        while ((iterator.hasNext())
                && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator
                    .next();
            tempStrength = tps.strength;

            // System.out.println(tps);

            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
            Double theDoubleObject = (Double) workCorpus.get(tps.token);
            // if either the original token or a degeneration was found use the double value, otherwise use the default
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject.doubleValue();
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
            // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
        }

        return (p / (p + np));
    }

    /**
     * Returns true if the string is at least two characters long and
     * every character equals the first.
     *
     * NOTE(review): currently unused within this class; retained in
     * case external/maintenance code relies on it.
     */
    private boolean allSameChar(String s) {
        if (s.length() < 2) {
            return false;
        }

        char c = s.charAt(0);

        for (int i = 1; i < s.length(); i++) {
            if (s.charAt(i) != c) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if every character of the string is a digit
     * (true for the empty string).
     */
    private boolean allDigits(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (!Character.isDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
|