001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * MakeDensityBasedClusterer.java
019: * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.clusterers;
024:
025: import weka.core.Capabilities;
026: import weka.core.Instance;
027: import weka.core.Instances;
028: import weka.core.Option;
029: import weka.core.OptionHandler;
030: import weka.core.Utils;
031: import weka.core.WeightedInstancesHandler;
032: import weka.estimators.DiscreteEstimator;
033: import weka.filters.unsupervised.attribute.ReplaceMissingValues;
034:
035: import java.util.Enumeration;
036: import java.util.Vector;
037:
038: /**
039: <!-- globalinfo-start -->
040: * Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.
041: * <p/>
042: <!-- globalinfo-end -->
043: *
044: <!-- options-start -->
045: * Valid options are: <p/>
046: *
047: * <pre> -M <num>
048: * minimum allowable standard deviation for normal density computation
049: * (default 1e-6)</pre>
050: *
051: * <pre> -W <clusterer name>
052: * Clusterer to wrap.
053: * (default weka.clusterers.SimpleKMeans)</pre>
054: *
055: * <pre>
056: * Options specific to clusterer weka.clusterers.SimpleKMeans:
057: * </pre>
058: *
059: * <pre> -N <num>
060: * number of clusters. (default = 2).</pre>
061: *
062: * <pre> -S <num>
063: * random number seed.
064: * (default 10)</pre>
065: *
066: <!-- options-end -->
067: *
068: * Options after "--" are passed on to the base clusterer.
069: *
070: * @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
071: * @author Mark Hall (mhall@cs.waikato.ac.nz)
072: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
073: * @version $Revision: 1.13 $
074: */
075: public class MakeDensityBasedClusterer extends DensityBasedClusterer
076: implements NumberOfClustersRequestable, OptionHandler,
077: WeightedInstancesHandler {
078:
079: /** for serialization */
080: static final long serialVersionUID = -5643302427972186631L;
081:
082: /** holds training instances header information */
083: private Instances m_theInstances;
084: /** prior probabilities for the fitted clusters */
085: private double[] m_priors;
086: /** normal distributions fitted to each numeric attribute in each cluster */
087: private double[][][] m_modelNormal;
088: /** discrete distributions fitted to each discrete attribute in each cluster */
089: private DiscreteEstimator[][] m_model;
090: /** default minimum standard deviation */
091: private double m_minStdDev = 1e-6;
092: /** The clusterer being wrapped */
093: private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
094: /** globally replace missing values */
095: private ReplaceMissingValues m_replaceMissing;
096:
097: /**
098: * Default constructor.
099: *
100: */
101: public MakeDensityBasedClusterer() {
102: super ();
103: }
104:
105: /**
106: * Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.
107: *
108: * @param toWrap the clusterer to wrap around
109: */
110: public MakeDensityBasedClusterer(Clusterer toWrap) {
111:
112: setClusterer(toWrap);
113: }
114:
115: /**
116: * Returns a string describing classifier
117: * @return a description suitable for
118: * displaying in the explorer/experimenter gui
119: */
120: public String globalInfo() {
121: return "Class for wrapping a Clusterer to make it return a distribution "
122: + "and density. Fits normal distributions and discrete distributions "
123: + "within each cluster produced by the wrapped clusterer. Supports the "
124: + "NumberOfClustersRequestable interface only if the wrapped Clusterer "
125: + "does.";
126: }
127:
128: /**
129: * String describing default clusterer.
130: *
131: * @return the default clusterer classname
132: */
133: protected String defaultClustererString() {
134: return SimpleKMeans.class.getName();
135: }
136:
137: /**
138: * Set the number of clusters to generate.
139: *
140: * @param n the number of clusters to generate
141: * @throws Exception if the wrapped clusterer has not been set, or if
142: * the wrapped clusterer does not implement this facility.
143: */
144: public void setNumClusters(int n) throws Exception {
145: if (m_wrappedClusterer == null) {
146: throw new Exception(
147: "Can't set the number of clusters to generate - "
148: + "no clusterer has been set yet.");
149: }
150: if (!(m_wrappedClusterer instanceof NumberOfClustersRequestable)) {
151: throw new Exception(
152: "Can't set the number of clusters to generate - "
153: + "wrapped clusterer does not support this facility.");
154: }
155:
156: ((NumberOfClustersRequestable) m_wrappedClusterer)
157: .setNumClusters(n);
158: }
159:
160: /**
161: * Returns default capabilities of the clusterer (i.e., of the wrapper
162: * clusterer).
163: *
164: * @return the capabilities of this clusterer
165: */
166: public Capabilities getCapabilities() {
167: if (m_wrappedClusterer != null)
168: return m_wrappedClusterer.getCapabilities();
169: else
170: return super .getCapabilities();
171: }
172:
173: /**
174: * Builds a clusterer for a set of instances.
175: *
176: * @param data the instances to train the clusterer with
177: * @throws Exception if the clusterer hasn't been set or something goes wrong
178: */
179: public void buildClusterer(Instances data) throws Exception {
180: // can clusterer handle the data?
181: getCapabilities().testWithFail(data);
182:
183: m_replaceMissing = new ReplaceMissingValues();
184: m_replaceMissing.setInputFormat(data);
185: data = weka.filters.Filter.useFilter(data, m_replaceMissing);
186:
187: m_theInstances = new Instances(data, 0);
188: if (m_wrappedClusterer == null) {
189: throw new Exception("No clusterer has been set");
190: }
191: m_wrappedClusterer.buildClusterer(data);
192: m_model = new DiscreteEstimator[m_wrappedClusterer
193: .numberOfClusters()][data.numAttributes()];
194: m_modelNormal = new double[m_wrappedClusterer
195: .numberOfClusters()][data.numAttributes()][2];
196: double[][] weights = new double[m_wrappedClusterer
197: .numberOfClusters()][data.numAttributes()];
198: m_priors = new double[m_wrappedClusterer.numberOfClusters()];
199: for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
200: for (int j = 0; j < data.numAttributes(); j++) {
201: if (data.attribute(j).isNominal()) {
202: m_model[i][j] = new DiscreteEstimator(data
203: .attribute(j).numValues(), true);
204: }
205: }
206: }
207:
208: Instance inst = null;
209:
210: // Compute mean, etc.
211: int[] clusterIndex = new int[data.numInstances()];
212: for (int i = 0; i < data.numInstances(); i++) {
213: inst = data.instance(i);
214: int cluster = m_wrappedClusterer.clusterInstance(inst);
215: m_priors[cluster] += inst.weight();
216: for (int j = 0; j < data.numAttributes(); j++) {
217: if (!inst.isMissing(j)) {
218: if (data.attribute(j).isNominal()) {
219: m_model[cluster][j].addValue(inst.value(j),
220: inst.weight());
221: } else {
222: m_modelNormal[cluster][j][0] += inst.weight()
223: * inst.value(j);
224: weights[cluster][j] += inst.weight();
225: }
226: }
227: }
228: clusterIndex[i] = cluster;
229: }
230:
231: for (int j = 0; j < data.numAttributes(); j++) {
232: if (data.attribute(j).isNumeric()) {
233: for (int i = 0; i < m_wrappedClusterer
234: .numberOfClusters(); i++) {
235: if (weights[i][j] > 0) {
236: m_modelNormal[i][j][0] /= weights[i][j];
237: }
238: }
239: }
240: }
241:
242: // Compute standard deviations
243: for (int i = 0; i < data.numInstances(); i++) {
244: inst = data.instance(i);
245: for (int j = 0; j < data.numAttributes(); j++) {
246: if (!inst.isMissing(j)) {
247: if (data.attribute(j).isNumeric()) {
248: double diff = m_modelNormal[clusterIndex[i]][j][0]
249: - inst.value(j);
250: m_modelNormal[clusterIndex[i]][j][1] += inst
251: .weight()
252: * diff * diff;
253: }
254: }
255: }
256: }
257:
258: for (int j = 0; j < data.numAttributes(); j++) {
259: if (data.attribute(j).isNumeric()) {
260: for (int i = 0; i < m_wrappedClusterer
261: .numberOfClusters(); i++) {
262: if (weights[i][j] > 0) {
263: m_modelNormal[i][j][1] = Math
264: .sqrt(m_modelNormal[i][j][1]
265: / weights[i][j]);
266: } else if (weights[i][j] <= 0) {
267: m_modelNormal[i][j][1] = Double.MAX_VALUE;
268: }
269: if (m_modelNormal[i][j][1] <= m_minStdDev) {
270: m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
271: if (m_modelNormal[i][j][1] <= m_minStdDev) {
272: m_modelNormal[i][j][1] = m_minStdDev;
273: }
274: }
275: }
276: }
277: }
278:
279: Utils.normalize(m_priors);
280: }
281:
282: /**
283: * Returns the cluster priors.
284: *
285: * @return the cluster priors
286: */
287: public double[] clusterPriors() {
288:
289: double[] n = new double[m_priors.length];
290:
291: System.arraycopy(m_priors, 0, n, 0, n.length);
292: return n;
293: }
294:
295: /**
296: * Computes the log of the conditional density (per cluster) for a given instance.
297: *
298: * @param inst the instance to compute the density for
299: * @return an array containing the estimated densities
300: * @throws Exception if the density could not be computed
301: * successfully
302: */
303: public double[] logDensityPerClusterForInstance(Instance inst)
304: throws Exception {
305:
306: int i, j;
307: double logprob;
308: double[] wghts = new double[m_wrappedClusterer
309: .numberOfClusters()];
310:
311: m_replaceMissing.input(inst);
312: inst = m_replaceMissing.output();
313:
314: for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
315: logprob = 0;
316: for (j = 0; j < inst.numAttributes(); j++) {
317: if (!inst.isMissing(j)) {
318: if (inst.attribute(j).isNominal()) {
319: logprob += Math.log(m_model[i][j]
320: .getProbability(inst.value(j)));
321: } else { // numeric attribute
322: logprob += logNormalDens(inst.value(j),
323: m_modelNormal[i][j][0],
324: m_modelNormal[i][j][1]);
325: }
326: }
327: }
328: wghts[i] = logprob;
329: }
330: return wghts;
331: }
332:
333: /** Constant for normal distribution. */
334: private static double m_normConst = 0.5 * Math.log(2 * Math.PI);
335:
336: /**
337: * Density function of normal distribution.
338: * @param x input value
339: * @param mean mean of distribution
340: * @param stdDev standard deviation of distribution
341: * @return the density
342: */
343: private double logNormalDens(double x, double mean, double stdDev) {
344:
345: double diff = x - mean;
346:
347: return -(diff * diff / (2 * stdDev * stdDev)) - m_normConst
348: - Math.log(stdDev);
349: }
350:
351: /**
352: * Returns the number of clusters.
353: *
354: * @return the number of clusters generated for a training dataset.
355: * @throws Exception if number of clusters could not be returned successfully
356: */
357: public int numberOfClusters() throws Exception {
358:
359: return m_wrappedClusterer.numberOfClusters();
360: }
361:
362: /**
363: * Returns a description of the clusterer.
364: *
365: * @return a string containing a description of the clusterer
366: */
367: public String toString() {
368: StringBuffer text = new StringBuffer();
369: text
370: .append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "
371: + m_wrappedClusterer.toString());
372:
373: text
374: .append("\nFitted estimators (with ML estimates of variance):\n");
375:
376: for (int j = 0; j < m_priors.length; j++) {
377: text.append("\nCluster: " + j + " Prior probability: "
378: + Utils.doubleToString(m_priors[j], 4) + "\n\n");
379:
380: for (int i = 0; i < m_model[0].length; i++) {
381: text.append("Attribute: "
382: + m_theInstances.attribute(i).name() + "\n");
383:
384: if (m_theInstances.attribute(i).isNominal()) {
385: if (m_model[j][i] != null) {
386: text.append(m_model[j][i].toString());
387: }
388: } else {
389: text.append("Normal Distribution. Mean = "
390: + Utils.doubleToString(
391: m_modelNormal[j][i][0], 4)
392: + " StdDev = "
393: + Utils.doubleToString(
394: m_modelNormal[j][i][1], 4) + "\n");
395: }
396: }
397: }
398:
399: return text.toString();
400: }
401:
402: /**
403: * Returns the tip text for this property
404: * @return tip text for this property suitable for
405: * displaying in the explorer/experimenter gui
406: */
407: public String clustererTipText() {
408: return "the clusterer to wrap";
409: }
410:
411: /**
412: * Sets the clusterer to wrap.
413: *
414: * @param toWrap the clusterer
415: */
416: public void setClusterer(Clusterer toWrap) {
417:
418: m_wrappedClusterer = toWrap;
419: }
420:
421: /**
422: * Gets the clusterer being wrapped.
423: *
424: * @return the clusterer
425: */
426: public Clusterer getClusterer() {
427:
428: return m_wrappedClusterer;
429: }
430:
431: /**
432: * Returns the tip text for this property
433: * @return tip text for this property suitable for
434: * displaying in the explorer/experimenter gui
435: */
436: public String minStdDevTipText() {
437: return "set minimum allowable standard deviation";
438: }
439:
440: /**
441: * Set the minimum value for standard deviation when calculating
442: * normal density. Reducing this value can help prevent arithmetic
443: * overflow resulting from multiplying large densities (arising from small
444: * standard deviations) when there are many singleton or near singleton
445: * values.
446: * @param m minimum value for standard deviation
447: */
448: public void setMinStdDev(double m) {
449: m_minStdDev = m;
450: }
451:
452: /**
453: * Get the minimum allowable standard deviation.
454: * @return the minumum allowable standard deviation
455: */
456: public double getMinStdDev() {
457: return m_minStdDev;
458: }
459:
460: /**
461: * Returns an enumeration describing the available options..
462: *
463: * @return an enumeration of all the available options.
464: */
465: public Enumeration listOptions() {
466: Vector result = new Vector();
467:
468: result.addElement(new Option(
469: "\tminimum allowable standard deviation for normal density computation "
470: + "\n\t(default 1e-6)", "M", 1, "-M <num>"));
471:
472: result.addElement(new Option("\tClusterer to wrap.\n"
473: + "\t(default " + defaultClustererString() + ")", "W",
474: 1, "-W <clusterer name>"));
475:
476: if ((m_wrappedClusterer != null)
477: && (m_wrappedClusterer instanceof OptionHandler)) {
478: result.addElement(new Option("", "", 0,
479: "\nOptions specific to clusterer "
480: + m_wrappedClusterer.getClass().getName()
481: + ":"));
482: Enumeration enu = ((OptionHandler) m_wrappedClusterer)
483: .listOptions();
484: while (enu.hasMoreElements()) {
485: result.addElement(enu.nextElement());
486: }
487: }
488:
489: return result.elements();
490: }
491:
492: /**
493: * Parses a given list of options. <p/>
494: *
495: <!-- options-start -->
496: * Valid options are: <p/>
497: *
498: * <pre> -M <num>
499: * minimum allowable standard deviation for normal density computation
500: * (default 1e-6)</pre>
501: *
502: * <pre> -W <clusterer name>
503: * Clusterer to wrap.
504: * (default weka.clusterers.SimpleKMeans)</pre>
505: *
506: * <pre>
507: * Options specific to clusterer weka.clusterers.SimpleKMeans:
508: * </pre>
509: *
510: * <pre> -N <num>
511: * number of clusters. (default = 2).</pre>
512: *
513: * <pre> -S <num>
514: * random number seed.
515: * (default 10)</pre>
516: *
517: <!-- options-end -->
518: *
519: * @param options the list of options as an array of strings
520: * @throws Exception if an option is not supported
521: */
522: public void setOptions(String[] options) throws Exception {
523:
524: String optionString = Utils.getOption('M', options);
525: if (optionString.length() != 0)
526: setMinStdDev((new Double(optionString)).doubleValue());
527: else
528: setMinStdDev(1e-6);
529:
530: String wString = Utils.getOption('W', options);
531: if (wString.length() == 0)
532: wString = defaultClustererString();
533: setClusterer(Clusterer.forName(wString, Utils
534: .partitionOptions(options)));
535: }
536:
537: /**
538: * Gets the current settings of the clusterer.
539: *
540: * @return an array of strings suitable for passing to setOptions()
541: */
542: public String[] getOptions() {
543:
544: String[] clustererOptions = new String[0];
545: if ((m_wrappedClusterer != null)
546: && (m_wrappedClusterer instanceof OptionHandler)) {
547: clustererOptions = ((OptionHandler) m_wrappedClusterer)
548: .getOptions();
549: }
550: String[] options = new String[clustererOptions.length + 5];
551: int current = 0;
552:
553: options[current++] = "-M";
554: options[current++] = "" + getMinStdDev();
555:
556: if (getClusterer() != null) {
557: options[current++] = "-W";
558: options[current++] = getClusterer().getClass().getName();
559: }
560: options[current++] = "--";
561:
562: System.arraycopy(clustererOptions, 0, options, current,
563: clustererOptions.length);
564: current += clustererOptions.length;
565: while (current < options.length) {
566: options[current++] = "";
567: }
568: return options;
569: }
570:
571: /**
572: * Main method for testing this class.
573: *
574: * @param argv the options
575: */
576: public static void main(String[] argv) {
577: runClusterer(new MakeDensityBasedClusterer(), argv);
578: }
579: }
|