001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * RuleGeneration.java
019: * Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.associations;
024:
025: import java.io.*;
026: import java.util.*;
027: import java.lang.Math;
028: import weka.core.*;
029: import weka.associations.ItemSet;
030:
031: /**
032: * Class implementing the rule generation procedure of the predictive apriori algorithm.
033: *
034: * Reference: T. Scheffer (2001). <i>Finding Association Rules That Trade Support
035: * Optimally against Confidence</i>. Proc of the 5th European Conf.
036: * on Principles and Practice of Knowledge Discovery in Databases (PKDD'01),
037: * pp. 424-435. Freiburg, Germany: Springer-Verlag. <p>
038: *
039: * The implementation follows the paper expect for adding a rule to the output of the
040: * <i>n</i> best rules. A rule is added if:
041: * the expected predictive accuracy of this rule is among the <i>n</i> best and it is
042: * not subsumed by a rule with at least the same expected predictive accuracy
043: * (out of an unpublished manuscript from T. Scheffer).
044: *
045: * @author Stefan Mutter (mutter@cs.waikato.ac.nz)
046: * @version $Revision: 1.3 $ */
047: public class RuleGeneration implements Serializable {
048:
049: /** for serialization */
050: private static final long serialVersionUID = -8927041669872491432L;
051:
052: /** The items stored as an array of of integer. */
053: protected int[] m_items;
054:
055: /** Counter for how many transactions contain this item set. */
056: protected int m_counter;
057:
058: /** The total number of transactions */
059: protected int m_totalTransactions;
060:
061: /** Flag indicating whether the list fo the best rules has changed. */
062: protected boolean m_change = false;
063:
064: /** The minimum expected predictive accuracy that is needed to be a candidate for the list of the best rules. */
065: protected double m_expectation;
066:
067: /** Threshold. If the support of the premise is higher the binomial distrubution is approximated by a normal one. */
068: protected static final int MAX_N = 300;
069:
070: /** The minimum support a rule needs to be a candidate for the list of the best rules. */
071: protected int m_minRuleCount;
072:
073: /** Sorted array of the mied points of the intervals used for prior estimation. */
074: protected double[] m_midPoints;
075:
076: /** Hashtable conatining the estimated prior probabilities. */
077: protected Hashtable m_priors;
078:
079: /** The list of the actual <i>n</i> best rules. */
080: protected TreeSet m_best;
081:
082: /** Integer indicating the generation time of a rule. */
083: protected int m_count;
084:
085: /** The instances. */
086: protected Instances m_instances;
087:
088: /**
089: * Constructor
090: * @param itemSet item set for that rules should be generated.
091: * The item set will form the premise of the rules.
092: */
093: public RuleGeneration(ItemSet itemSet) {
094:
095: m_totalTransactions = itemSet.m_totalTransactions;
096: m_counter = itemSet.m_counter;
097: m_items = itemSet.m_items;
098: }
099:
100: /**
101: * calculates the probability using a binomial distribution.
102: * If the support of the premise is too large this distribution
103: * is approximated by a normal distribution.
104: * @param accuracy the accuracy value
105: * @param ruleCount the support of the whole rule
106: * @param premiseCount the support of the premise
107: * @return the probability value
108: */
109: public static final double binomialDistribution(double accuracy,
110: double ruleCount, double premiseCount) {
111:
112: double mu, sigma;
113:
114: if (premiseCount < MAX_N)
115: return Math
116: .pow(
117: 2,
118: (Utils.log2(Math.pow(accuracy, ruleCount))
119: + Utils
120: .log2(Math
121: .pow(
122: (1.0 - accuracy),
123: (premiseCount - ruleCount))) + PriorEstimation
124: .logbinomialCoefficient(
125: (int) premiseCount,
126: (int) ruleCount)));
127: else {
128: mu = premiseCount * accuracy;
129: sigma = Math.sqrt((premiseCount * (1.0 - accuracy))
130: * accuracy);
131: return Statistics
132: .normalProbability(((ruleCount + 0.5) - mu)
133: / (sigma * Math.sqrt(2)));
134: }
135: }
136:
137: /**
138: * calculates the expected predctive accuracy of a rule
139: * @param ruleCount the support of the rule
140: * @param premiseCount the premise support of the rule
141: * @param midPoints array with all mid points
142: * @param priors hashtable containing the prior probabilities
143: * @return the expected predictive accuracy
144: */
145: public static final double expectation(double ruleCount,
146: int premiseCount, double[] midPoints, Hashtable priors) {
147:
148: double numerator = 0, denominator = 0;
149: for (int i = 0; i < midPoints.length; i++) {
150: Double actualPrior = (Double) priors.get(new Double(
151: midPoints[i]));
152: if (actualPrior != null) {
153: if (actualPrior.doubleValue() != 0) {
154: double addend = actualPrior.doubleValue()
155: * binomialDistribution(midPoints[i],
156: ruleCount, (double) premiseCount);
157: denominator += addend;
158: numerator += addend * midPoints[i];
159: }
160: }
161: }
162: if (denominator <= 0 || Double.isNaN(denominator))
163: System.out.println("RuleItem denominator: " + denominator);
164: if (numerator <= 0 || Double.isNaN(numerator))
165: System.out.println("RuleItem numerator: " + numerator);
166: return numerator / denominator;
167: }
168:
169: /**
170: * Generates all rules for an item set. The item set is the premise.
171: * @param numRules the number of association rules the use wants to mine.
172: * This number equals the size <i>n</i> of the list of the
173: * best rules.
174: * @param midPoints the mid points of the intervals
175: * @param priors Hashtable that contains the prior probabilities
176: * @param expectation the minimum value of the expected predictive accuracy
177: * that is needed to get into the list of the best rules
178: * @param instances the instances for which association rules are generated
179: * @param best the list of the <i>n</i> best rules.
180: * The list is implemented as a TreeSet
181: * @param genTime the maximum time of generation
182: * @return all the rules with minimum confidence for the given item set
183: */
184: public TreeSet generateRules(int numRules, double[] midPoints,
185: Hashtable priors, double expectation, Instances instances,
186: TreeSet best, int genTime) {
187:
188: boolean redundant = false;
189: FastVector consequences = new FastVector(), consequencesMinusOne = new FastVector();
190: ItemSet premise;
191: int s = 0;
192: RuleItem current = null, old;
193:
194: Hashtable hashtable;
195:
196: m_change = false;
197: m_midPoints = midPoints;
198: m_priors = priors;
199: m_best = best;
200: m_expectation = expectation;
201: m_count = genTime;
202: m_instances = instances;
203:
204: //create rule body
205: premise = null;
206: premise = new ItemSet(m_totalTransactions);
207: premise.m_items = new int[m_items.length];
208: System
209: .arraycopy(m_items, 0, premise.m_items, 0,
210: m_items.length);
211: premise.m_counter = m_counter;
212:
213: do {
214: m_minRuleCount = 1;
215: while (expectation((double) m_minRuleCount,
216: premise.m_counter, m_midPoints, m_priors) <= m_expectation) {
217: m_minRuleCount++;
218: if (m_minRuleCount > premise.m_counter)
219: return m_best;
220: }
221: redundant = false;
222: for (int i = 0; i < instances.numAttributes(); i++) {
223: if (i == 0) {
224: for (int j = 0; j < m_items.length; j++)
225: if (m_items[j] == -1)
226: consequences = singleConsequence(instances,
227: j, consequences);
228: if (premise == null || consequences.size() == 0)
229: return m_best;
230: }
231: FastVector allRuleItems = new FastVector();
232: int index = 0;
233: do {
234: int h = 0;
235: while (h < consequences.size()) {
236: RuleItem dummie = new RuleItem();
237: current = dummie.generateRuleItem(premise,
238: (ItemSet) consequences.elementAt(h),
239: instances, m_count, m_minRuleCount,
240: m_midPoints, m_priors);
241: if (current != null) {
242: allRuleItems.addElement(current);
243: h++;
244: } else
245: consequences.removeElementAt(h);
246: }
247: if (index == i)
248: break;
249: consequencesMinusOne = consequences;
250: consequences = ItemSet.mergeAllItemSets(
251: consequencesMinusOne, index, instances
252: .numInstances());
253: hashtable = ItemSet.getHashtable(
254: consequencesMinusOne, consequencesMinusOne
255: .size());
256: consequences = ItemSet.pruneItemSets(consequences,
257: hashtable);
258: index++;
259: } while (consequences.size() > 0);
260: for (int h = 0; h < allRuleItems.size(); h++) {
261: current = (RuleItem) allRuleItems.elementAt(h);
262: m_count++;
263: if (m_best.size() < numRules) {
264: m_change = true;
265: redundant = removeRedundant(current);
266: } else {
267: if (current.accuracy() > m_expectation) {
268: m_expectation = ((RuleItem) (m_best.first()))
269: .accuracy();
270: boolean remove = m_best.remove(m_best
271: .first());
272: m_change = true;
273: redundant = removeRedundant(current);
274: m_expectation = ((RuleItem) (m_best.first()))
275: .accuracy();
276: while (expectation((double) m_minRuleCount,
277: (current.premise()).m_counter,
278: m_midPoints, m_priors) < m_expectation) {
279: m_minRuleCount++;
280: if (m_minRuleCount > (current.premise()).m_counter)
281: break;
282: }
283: }
284: }
285: }
286:
287: }
288: } while (redundant);
289: return m_best;
290: }
291:
292: /**
293: * Methods that decides whether or not rule a subsumes rule b.
294: * The defintion of subsumption is:
295: * Rule a subsumes rule b, if a subsumes b
296: * AND
297: * a has got least the same expected predictive accuracy as b.
298: * @param a an association rule stored as a RuleItem
299: * @param b an association rule stored as a RuleItem
300: * @return true if rule a subsumes rule b or false otherwise.
301: */
302: public static boolean aSubsumesB(RuleItem a, RuleItem b) {
303:
304: if (a.m_accuracy < b.m_accuracy)
305: return false;
306: for (int k = 0; k < a.premise().m_items.length; k++) {
307: if (a.premise().m_items[k] != b.premise().m_items[k]) {
308: if ((a.premise().m_items[k] != -1 && b.premise().m_items[k] != -1)
309: || b.premise().m_items[k] == -1)
310: return false;
311: }
312: if (a.consequence().m_items[k] != b.consequence().m_items[k]) {
313: if ((a.consequence().m_items[k] != -1 && b
314: .consequence().m_items[k] != -1)
315: || a.consequence().m_items[k] == -1)
316: return false;
317: }
318: }
319: return true;
320:
321: }
322:
323: /**
324: * generates a consequence of length 1 for an association rule.
325: * @param instances the instances under consideration
326: * @param attNum an item that does not occur in the premise
327: * @param consequences FastVector that possibly already contains other consequences of length 1
328: * @return FastVector with consequences of length 1
329: */
330: public static FastVector singleConsequence(Instances instances,
331: int attNum, FastVector consequences) {
332:
333: ItemSet consequence;
334:
335: for (int i = 0; i < instances.numAttributes(); i++) {
336: if (i == attNum) {
337: for (int j = 0; j < instances.attribute(i).numValues(); j++) {
338: consequence = new ItemSet(instances.numInstances());
339: consequence.m_items = new int[instances
340: .numAttributes()];
341: for (int k = 0; k < instances.numAttributes(); k++)
342: consequence.m_items[k] = -1;
343: consequence.m_items[i] = j;
344: consequences.addElement(consequence);
345: }
346: }
347: }
348: return consequences;
349:
350: }
351:
352: /**
353: * Method that removes redundant rules out of the list of the best rules.
354: * A rule is in that list if:
355: * the expected predictive accuracy of this rule is among the best and it is
356: * not subsumed by a rule with at least the same expected predictive accuracy
357: * @param toInsert the rule that should be inserted into the list
358: * @return true if the method has changed the list, false otherwise
359: */
360: public boolean removeRedundant(RuleItem toInsert) {
361:
362: boolean redundant = false, fSubsumesT = false, tSubsumesF = false;
363: RuleItem first;
364: int subsumes = 0;
365: Object[] best = m_best.toArray();
366: for (int i = 0; i < best.length; i++) {
367: first = (RuleItem) best[i];
368: fSubsumesT = aSubsumesB(first, toInsert);
369: tSubsumesF = aSubsumesB(toInsert, first);
370: if (fSubsumesT) {
371: subsumes = 1;
372: break;
373: } else {
374: if (tSubsumesF) {
375: boolean remove = m_best.remove(first);
376: subsumes = 2;
377: redundant = true;
378: }
379: }
380: }
381: if (subsumes == 0 || subsumes == 2)
382: m_best.add(toInsert);
383: return redundant;
384: }
385:
386: /**
387: * Gets the actual maximum value of the generation time
388: * @return the actual maximum value of the generation time
389: */
390: public int count() {
391:
392: return m_count;
393: }
394:
395: /**
396: * Gets if the list fo the best rules has been changed
397: * @return whether or not the list fo the best rules has been changed
398: */
399: public boolean change() {
400:
401: return m_change;
402: }
403: }
|