Source Code Cross Referenced for MakeDensityBasedClusterer.java in » Science » weka » weka » clusterers » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Science » weka » weka.clusterers
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         *    This program is free software; you can redistribute it and/or modify
003:         *    it under the terms of the GNU General Public License as published by
004:         *    the Free Software Foundation; either version 2 of the License, or
005:         *    (at your option) any later version.
006:         *
007:         *    This program is distributed in the hope that it will be useful,
008:         *    but WITHOUT ANY WARRANTY; without even the implied warranty of
009:         *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
010:         *    GNU General Public License for more details.
011:         *
012:         *    You should have received a copy of the GNU General Public License
013:         *    along with this program; if not, write to the Free Software
014:         *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015:         */
016:
017:        /*
018:         *    MakeDensityBasedClusterer.java
019:         *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
020:         *
021:         */
022:
023:        package weka.clusterers;
024:
025:        import weka.core.Capabilities;
026:        import weka.core.Instance;
027:        import weka.core.Instances;
028:        import weka.core.Option;
029:        import weka.core.OptionHandler;
030:        import weka.core.Utils;
031:        import weka.core.WeightedInstancesHandler;
032:        import weka.estimators.DiscreteEstimator;
033:        import weka.filters.unsupervised.attribute.ReplaceMissingValues;
034:
035:        import java.util.Enumeration;
036:        import java.util.Vector;
037:
038:        /**
039:         <!-- globalinfo-start -->
040:         * Class for wrapping a Clusterer to make it return a distribution and density. Fits normal distributions and discrete distributions within each cluster produced by the wrapped clusterer. Supports the NumberOfClustersRequestable interface only if the wrapped Clusterer does.
041:         * <p/>
042:         <!-- globalinfo-end -->
043:         *
044:         <!-- options-start -->
045:         * Valid options are: <p/>
046:         * 
047:         * <pre> -M &lt;num&gt;
048:         *  minimum allowable standard deviation for normal density computation 
049:         *  (default 1e-6)</pre>
050:         * 
051:         * <pre> -W &lt;clusterer name&gt;
052:         *  Clusterer to wrap.
053:         *  (default weka.clusterers.SimpleKMeans)</pre>
054:         * 
055:         * <pre> 
056:         * Options specific to clusterer weka.clusterers.SimpleKMeans:
057:         * </pre>
058:         * 
059:         * <pre> -N &lt;num&gt;
060:         *  number of clusters. (default = 2).</pre>
061:         * 
062:         * <pre> -S &lt;num&gt;
063:         *  random number seed.
064:         *  (default 10)</pre>
065:         * 
066:         <!-- options-end -->
067:         * 
068:         * Options after "--" are passed on to the base clusterer.
069:         *
070:         * @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
071:         * @author Mark Hall (mhall@cs.waikato.ac.nz)
072:         * @author Eibe Frank (eibe@cs.waikato.ac.nz)
073:         * @version $Revision: 1.13 $
074:         */
075:        public class MakeDensityBasedClusterer extends DensityBasedClusterer
076:                implements  NumberOfClustersRequestable, OptionHandler,
077:                WeightedInstancesHandler {
078:
079:            /** for serialization */
080:            static final long serialVersionUID = -5643302427972186631L;
081:
082:            /** holds training instances header information */
083:            private Instances m_theInstances;
084:            /** prior probabilities for the fitted clusters */
085:            private double[] m_priors;
086:            /** normal distributions fitted to each numeric attribute in each cluster */
087:            private double[][][] m_modelNormal;
088:            /** discrete distributions fitted to each discrete attribute in each cluster */
089:            private DiscreteEstimator[][] m_model;
090:            /** default minimum standard deviation */
091:            private double m_minStdDev = 1e-6;
092:            /** The clusterer being wrapped */
093:            private Clusterer m_wrappedClusterer = new weka.clusterers.SimpleKMeans();
094:            /** globally replace missing values */
095:            private ReplaceMissingValues m_replaceMissing;
096:
097:            /**
098:             * Default constructor.
099:             * 
100:             */
101:            public MakeDensityBasedClusterer() {
102:                super ();
103:            }
104:
105:            /**
106:             * Contructs a MakeDensityBasedClusterer wrapping a given Clusterer.
107:             * 
108:             * @param toWrap the clusterer to wrap around
109:             */
110:            public MakeDensityBasedClusterer(Clusterer toWrap) {
111:
112:                setClusterer(toWrap);
113:            }
114:
115:            /**
116:             * Returns a string describing classifier
117:             * @return a description suitable for
118:             * displaying in the explorer/experimenter gui
119:             */
120:            public String globalInfo() {
121:                return "Class for wrapping a Clusterer to make it return a distribution "
122:                        + "and density. Fits normal distributions and discrete distributions "
123:                        + "within each cluster produced by the wrapped clusterer. Supports the "
124:                        + "NumberOfClustersRequestable interface only if the wrapped Clusterer "
125:                        + "does.";
126:            }
127:
128:            /**
129:             * String describing default clusterer.
130:             * 
131:             * @return 		the default clusterer classname
132:             */
133:            protected String defaultClustererString() {
134:                return SimpleKMeans.class.getName();
135:            }
136:
137:            /**
138:             * Set the number of clusters to generate.
139:             *
140:             * @param n the number of clusters to generate
141:             * @throws Exception if the wrapped clusterer has not been set, or if
142:             * the wrapped clusterer does not implement this facility.
143:             */
144:            public void setNumClusters(int n) throws Exception {
145:                if (m_wrappedClusterer == null) {
146:                    throw new Exception(
147:                            "Can't set the number of clusters to generate - "
148:                                    + "no clusterer has been set yet.");
149:                }
150:                if (!(m_wrappedClusterer instanceof  NumberOfClustersRequestable)) {
151:                    throw new Exception(
152:                            "Can't set the number of clusters to generate - "
153:                                    + "wrapped clusterer does not support this facility.");
154:                }
155:
156:                ((NumberOfClustersRequestable) m_wrappedClusterer)
157:                        .setNumClusters(n);
158:            }
159:
160:            /**
161:             * Returns default capabilities of the clusterer (i.e., of the wrapper
162:             * clusterer).
163:             *
164:             * @return      the capabilities of this clusterer
165:             */
166:            public Capabilities getCapabilities() {
167:                if (m_wrappedClusterer != null)
168:                    return m_wrappedClusterer.getCapabilities();
169:                else
170:                    return super .getCapabilities();
171:            }
172:
173:            /**
174:             * Builds a clusterer for a set of instances.
175:             *
176:             * @param data the instances to train the clusterer with
177:             * @throws Exception if the clusterer hasn't been set or something goes wrong
178:             */
179:            public void buildClusterer(Instances data) throws Exception {
180:                // can clusterer handle the data?
181:                getCapabilities().testWithFail(data);
182:
183:                m_replaceMissing = new ReplaceMissingValues();
184:                m_replaceMissing.setInputFormat(data);
185:                data = weka.filters.Filter.useFilter(data, m_replaceMissing);
186:
187:                m_theInstances = new Instances(data, 0);
188:                if (m_wrappedClusterer == null) {
189:                    throw new Exception("No clusterer has been set");
190:                }
191:                m_wrappedClusterer.buildClusterer(data);
192:                m_model = new DiscreteEstimator[m_wrappedClusterer
193:                        .numberOfClusters()][data.numAttributes()];
194:                m_modelNormal = new double[m_wrappedClusterer
195:                        .numberOfClusters()][data.numAttributes()][2];
196:                double[][] weights = new double[m_wrappedClusterer
197:                        .numberOfClusters()][data.numAttributes()];
198:                m_priors = new double[m_wrappedClusterer.numberOfClusters()];
199:                for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
200:                    for (int j = 0; j < data.numAttributes(); j++) {
201:                        if (data.attribute(j).isNominal()) {
202:                            m_model[i][j] = new DiscreteEstimator(data
203:                                    .attribute(j).numValues(), true);
204:                        }
205:                    }
206:                }
207:
208:                Instance inst = null;
209:
210:                // Compute mean, etc.
211:                int[] clusterIndex = new int[data.numInstances()];
212:                for (int i = 0; i < data.numInstances(); i++) {
213:                    inst = data.instance(i);
214:                    int cluster = m_wrappedClusterer.clusterInstance(inst);
215:                    m_priors[cluster] += inst.weight();
216:                    for (int j = 0; j < data.numAttributes(); j++) {
217:                        if (!inst.isMissing(j)) {
218:                            if (data.attribute(j).isNominal()) {
219:                                m_model[cluster][j].addValue(inst.value(j),
220:                                        inst.weight());
221:                            } else {
222:                                m_modelNormal[cluster][j][0] += inst.weight()
223:                                        * inst.value(j);
224:                                weights[cluster][j] += inst.weight();
225:                            }
226:                        }
227:                    }
228:                    clusterIndex[i] = cluster;
229:                }
230:
231:                for (int j = 0; j < data.numAttributes(); j++) {
232:                    if (data.attribute(j).isNumeric()) {
233:                        for (int i = 0; i < m_wrappedClusterer
234:                                .numberOfClusters(); i++) {
235:                            if (weights[i][j] > 0) {
236:                                m_modelNormal[i][j][0] /= weights[i][j];
237:                            }
238:                        }
239:                    }
240:                }
241:
242:                // Compute standard deviations
243:                for (int i = 0; i < data.numInstances(); i++) {
244:                    inst = data.instance(i);
245:                    for (int j = 0; j < data.numAttributes(); j++) {
246:                        if (!inst.isMissing(j)) {
247:                            if (data.attribute(j).isNumeric()) {
248:                                double diff = m_modelNormal[clusterIndex[i]][j][0]
249:                                        - inst.value(j);
250:                                m_modelNormal[clusterIndex[i]][j][1] += inst
251:                                        .weight()
252:                                        * diff * diff;
253:                            }
254:                        }
255:                    }
256:                }
257:
258:                for (int j = 0; j < data.numAttributes(); j++) {
259:                    if (data.attribute(j).isNumeric()) {
260:                        for (int i = 0; i < m_wrappedClusterer
261:                                .numberOfClusters(); i++) {
262:                            if (weights[i][j] > 0) {
263:                                m_modelNormal[i][j][1] = Math
264:                                        .sqrt(m_modelNormal[i][j][1]
265:                                                / weights[i][j]);
266:                            } else if (weights[i][j] <= 0) {
267:                                m_modelNormal[i][j][1] = Double.MAX_VALUE;
268:                            }
269:                            if (m_modelNormal[i][j][1] <= m_minStdDev) {
270:                                m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
271:                                if (m_modelNormal[i][j][1] <= m_minStdDev) {
272:                                    m_modelNormal[i][j][1] = m_minStdDev;
273:                                }
274:                            }
275:                        }
276:                    }
277:                }
278:
279:                Utils.normalize(m_priors);
280:            }
281:
282:            /**
283:             * Returns the cluster priors.
284:             * 
285:             * @return the cluster priors
286:             */
287:            public double[] clusterPriors() {
288:
289:                double[] n = new double[m_priors.length];
290:
291:                System.arraycopy(m_priors, 0, n, 0, n.length);
292:                return n;
293:            }
294:
295:            /**
296:             * Computes the log of the conditional density (per cluster) for a given instance.
297:             * 
298:             * @param inst the instance to compute the density for
299:             * @return an array containing the estimated densities
300:             * @throws Exception if the density could not be computed
301:             * successfully
302:             */
303:            public double[] logDensityPerClusterForInstance(Instance inst)
304:                    throws Exception {
305:
306:                int i, j;
307:                double logprob;
308:                double[] wghts = new double[m_wrappedClusterer
309:                        .numberOfClusters()];
310:
311:                m_replaceMissing.input(inst);
312:                inst = m_replaceMissing.output();
313:
314:                for (i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
315:                    logprob = 0;
316:                    for (j = 0; j < inst.numAttributes(); j++) {
317:                        if (!inst.isMissing(j)) {
318:                            if (inst.attribute(j).isNominal()) {
319:                                logprob += Math.log(m_model[i][j]
320:                                        .getProbability(inst.value(j)));
321:                            } else { // numeric attribute
322:                                logprob += logNormalDens(inst.value(j),
323:                                        m_modelNormal[i][j][0],
324:                                        m_modelNormal[i][j][1]);
325:                            }
326:                        }
327:                    }
328:                    wghts[i] = logprob;
329:                }
330:                return wghts;
331:            }
332:
333:            /** Constant for normal distribution. */
334:            private static double m_normConst = 0.5 * Math.log(2 * Math.PI);
335:
336:            /**
337:             * Density function of normal distribution.
338:             * @param x input value
339:             * @param mean mean of distribution
340:             * @param stdDev standard deviation of distribution
341:             * @return the density
342:             */
343:            private double logNormalDens(double x, double mean, double stdDev) {
344:
345:                double diff = x - mean;
346:
347:                return -(diff * diff / (2 * stdDev * stdDev)) - m_normConst
348:                        - Math.log(stdDev);
349:            }
350:
351:            /**
352:             * Returns the number of clusters.
353:             *
354:             * @return the number of clusters generated for a training dataset.
355:             * @throws Exception if number of clusters could not be returned successfully
356:             */
357:            public int numberOfClusters() throws Exception {
358:
359:                return m_wrappedClusterer.numberOfClusters();
360:            }
361:
362:            /**
363:             * Returns a description of the clusterer.
364:             *
365:             * @return a string containing a description of the clusterer
366:             */
367:            public String toString() {
368:                StringBuffer text = new StringBuffer();
369:                text
370:                        .append("MakeDensityBasedClusterer: \n\nWrapped clusterer: "
371:                                + m_wrappedClusterer.toString());
372:
373:                text
374:                        .append("\nFitted estimators (with ML estimates of variance):\n");
375:
376:                for (int j = 0; j < m_priors.length; j++) {
377:                    text.append("\nCluster: " + j + " Prior probability: "
378:                            + Utils.doubleToString(m_priors[j], 4) + "\n\n");
379:
380:                    for (int i = 0; i < m_model[0].length; i++) {
381:                        text.append("Attribute: "
382:                                + m_theInstances.attribute(i).name() + "\n");
383:
384:                        if (m_theInstances.attribute(i).isNominal()) {
385:                            if (m_model[j][i] != null) {
386:                                text.append(m_model[j][i].toString());
387:                            }
388:                        } else {
389:                            text.append("Normal Distribution. Mean = "
390:                                    + Utils.doubleToString(
391:                                            m_modelNormal[j][i][0], 4)
392:                                    + " StdDev = "
393:                                    + Utils.doubleToString(
394:                                            m_modelNormal[j][i][1], 4) + "\n");
395:                        }
396:                    }
397:                }
398:
399:                return text.toString();
400:            }
401:
402:            /**
403:             * Returns the tip text for this property
404:             * @return tip text for this property suitable for
405:             * displaying in the explorer/experimenter gui
406:             */
407:            public String clustererTipText() {
408:                return "the clusterer to wrap";
409:            }
410:
411:            /**
412:             * Sets the clusterer to wrap.
413:             *
414:             * @param toWrap the clusterer
415:             */
416:            public void setClusterer(Clusterer toWrap) {
417:
418:                m_wrappedClusterer = toWrap;
419:            }
420:
421:            /**
422:             * Gets the clusterer being wrapped.
423:             *
424:             * @return the clusterer
425:             */
426:            public Clusterer getClusterer() {
427:
428:                return m_wrappedClusterer;
429:            }
430:
431:            /**
432:             * Returns the tip text for this property
433:             * @return tip text for this property suitable for
434:             * displaying in the explorer/experimenter gui
435:             */
436:            public String minStdDevTipText() {
437:                return "set minimum allowable standard deviation";
438:            }
439:
440:            /**
441:             * Set the minimum value for standard deviation when calculating
442:             * normal density. Reducing this value can help prevent arithmetic
443:             * overflow resulting from multiplying large densities (arising from small
444:             * standard deviations) when there are many singleton or near singleton
445:             * values.
446:             * @param m minimum value for standard deviation
447:             */
448:            public void setMinStdDev(double m) {
449:                m_minStdDev = m;
450:            }
451:
452:            /**
453:             * Get the minimum allowable standard deviation.
454:             * @return the minumum allowable standard deviation
455:             */
456:            public double getMinStdDev() {
457:                return m_minStdDev;
458:            }
459:
460:            /**
461:             * Returns an enumeration describing the available options..
462:             *
463:             * @return an enumeration of all the available options.
464:             */
465:            public Enumeration listOptions() {
466:                Vector result = new Vector();
467:
468:                result.addElement(new Option(
469:                        "\tminimum allowable standard deviation for normal density computation "
470:                                + "\n\t(default 1e-6)", "M", 1, "-M <num>"));
471:
472:                result.addElement(new Option("\tClusterer to wrap.\n"
473:                        + "\t(default " + defaultClustererString() + ")", "W",
474:                        1, "-W <clusterer name>"));
475:
476:                if ((m_wrappedClusterer != null)
477:                        && (m_wrappedClusterer instanceof  OptionHandler)) {
478:                    result.addElement(new Option("", "", 0,
479:                            "\nOptions specific to clusterer "
480:                                    + m_wrappedClusterer.getClass().getName()
481:                                    + ":"));
482:                    Enumeration enu = ((OptionHandler) m_wrappedClusterer)
483:                            .listOptions();
484:                    while (enu.hasMoreElements()) {
485:                        result.addElement(enu.nextElement());
486:                    }
487:                }
488:
489:                return result.elements();
490:            }
491:
492:            /**
493:             * Parses a given list of options. <p/>
494:             *
495:             <!-- options-start -->
496:             * Valid options are: <p/>
497:             * 
498:             * <pre> -M &lt;num&gt;
499:             *  minimum allowable standard deviation for normal density computation 
500:             *  (default 1e-6)</pre>
501:             * 
502:             * <pre> -W &lt;clusterer name&gt;
503:             *  Clusterer to wrap.
504:             *  (default weka.clusterers.SimpleKMeans)</pre>
505:             * 
506:             * <pre> 
507:             * Options specific to clusterer weka.clusterers.SimpleKMeans:
508:             * </pre>
509:             * 
510:             * <pre> -N &lt;num&gt;
511:             *  number of clusters. (default = 2).</pre>
512:             * 
513:             * <pre> -S &lt;num&gt;
514:             *  random number seed.
515:             *  (default 10)</pre>
516:             * 
517:             <!-- options-end -->
518:             *
519:             * @param options the list of options as an array of strings
520:             * @throws Exception if an option is not supported
521:             */
522:            public void setOptions(String[] options) throws Exception {
523:
524:                String optionString = Utils.getOption('M', options);
525:                if (optionString.length() != 0)
526:                    setMinStdDev((new Double(optionString)).doubleValue());
527:                else
528:                    setMinStdDev(1e-6);
529:
530:                String wString = Utils.getOption('W', options);
531:                if (wString.length() == 0)
532:                    wString = defaultClustererString();
533:                setClusterer(Clusterer.forName(wString, Utils
534:                        .partitionOptions(options)));
535:            }
536:
537:            /**
538:             * Gets the current settings of the clusterer.
539:             *
540:             * @return an array of strings suitable for passing to setOptions()
541:             */
542:            public String[] getOptions() {
543:
544:                String[] clustererOptions = new String[0];
545:                if ((m_wrappedClusterer != null)
546:                        && (m_wrappedClusterer instanceof  OptionHandler)) {
547:                    clustererOptions = ((OptionHandler) m_wrappedClusterer)
548:                            .getOptions();
549:                }
550:                String[] options = new String[clustererOptions.length + 5];
551:                int current = 0;
552:
553:                options[current++] = "-M";
554:                options[current++] = "" + getMinStdDev();
555:
556:                if (getClusterer() != null) {
557:                    options[current++] = "-W";
558:                    options[current++] = getClusterer().getClass().getName();
559:                }
560:                options[current++] = "--";
561:
562:                System.arraycopy(clustererOptions, 0, options, current,
563:                        clustererOptions.length);
564:                current += clustererOptions.length;
565:                while (current < options.length) {
566:                    options[current++] = "";
567:                }
568:                return options;
569:            }
570:
571:            /**
572:             * Main method for testing this class.
573:             *
574:             * @param argv the options
575:             */
576:            public static void main(String[] argv) {
577:                runClusterer(new MakeDensityBasedClusterer(), argv);
578:            }
579:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.