001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * NaiveBayes.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.bayes;
024:
025: import weka.classifiers.Classifier;
026: import weka.core.Attribute;
027: import weka.core.Capabilities;
028: import weka.core.Instance;
029: import weka.core.Instances;
030: import weka.core.Option;
031: import weka.core.OptionHandler;
032: import weka.core.TechnicalInformation;
033: import weka.core.TechnicalInformationHandler;
034: import weka.core.Utils;
035: import weka.core.WeightedInstancesHandler;
036: import weka.core.Capabilities.Capability;
037: import weka.core.TechnicalInformation.Field;
038: import weka.core.TechnicalInformation.Type;
039: import weka.estimators.DiscreteEstimator;
040: import weka.estimators.Estimator;
041: import weka.estimators.KernelEstimator;
042: import weka.estimators.NormalEstimator;
043:
044: import java.util.Enumeration;
045: import java.util.Vector;
046:
047: /**
048: <!-- globalinfo-start -->
049: * Class for a Naive Bayes classifier using estimator classes. Numeric estimator precision values are chosen based on analysis of the training data. For this reason, the classifier is not an UpdateableClassifier (which in typical usage are initialized with zero training instances) -- if you need the UpdateableClassifier functionality, use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable classifier will use a default precision of 0.1 for numeric attributes when buildClassifier is called with zero training instances.<br/>
050: * <br/>
051: * For more information on Naive Bayes classifiers, see<br/>
052: * <br/>
053: * George H. John, Pat Langley: Estimating Continuous Distributions in Bayesian Classifiers. In: Eleventh Conference on Uncertainty in Artificial Intelligence, San Mateo, 338-345, 1995.
054: * <p/>
055: <!-- globalinfo-end -->
056: *
057: <!-- technical-bibtex-start -->
058: * BibTeX:
059: * <pre>
060: * @inproceedings{John1995,
061: * address = {San Mateo},
062: * author = {George H. John and Pat Langley},
063: * booktitle = {Eleventh Conference on Uncertainty in Artificial Intelligence},
064: * pages = {338-345},
065: * publisher = {Morgan Kaufmann},
066: * title = {Estimating Continuous Distributions in Bayesian Classifiers},
067: * year = {1995}
068: * }
069: * </pre>
070: * <p/>
071: <!-- technical-bibtex-end -->
072: *
073: <!-- options-start -->
074: * Valid options are: <p/>
075: *
076: * <pre> -K
077: * Use kernel density estimator rather than normal
078: * distribution for numeric attributes</pre>
079: *
080: * <pre> -D
081: * Use supervised discretization to process numeric attributes
082: * </pre>
083: *
084: <!-- options-end -->
085: *
086: * @author Len Trigg (trigg@cs.waikato.ac.nz)
087: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
088: * @version $Revision: 1.21 $
089: */
090: public class NaiveBayes extends Classifier implements OptionHandler,
091: WeightedInstancesHandler, TechnicalInformationHandler {
092:
093: /** for serialization */
094: static final long serialVersionUID = 5995231201785697655L;
095:
096: /** The attribute estimators. */
097: protected Estimator[][] m_Distributions;
098:
099: /** The class estimator. */
100: protected Estimator m_ClassDistribution;
101:
102: /**
103: * Whether to use kernel density estimator rather than normal distribution
104: * for numeric attributes
105: */
106: protected boolean m_UseKernelEstimator = false;
107:
108: /**
109: * Whether to use discretization than normal distribution
110: * for numeric attributes
111: */
112: protected boolean m_UseDiscretization = false;
113:
114: /** The number of classes (or 1 for numeric class) */
115: protected int m_NumClasses;
116:
117: /**
118: * The dataset header for the purposes of printing out a semi-intelligible
119: * model
120: */
121: protected Instances m_Instances;
122:
123: /*** The precision parameter used for numeric attributes */
124: protected static final double DEFAULT_NUM_PRECISION = 0.01;
125:
126: /**
127: * The discretization filter.
128: */
129: protected weka.filters.supervised.attribute.Discretize m_Disc = null;
130:
131: /**
132: * Returns a string describing this classifier
133: * @return a description of the classifier suitable for
134: * displaying in the explorer/experimenter gui
135: */
136: public String globalInfo() {
137: return "Class for a Naive Bayes classifier using estimator classes. Numeric"
138: + " estimator precision values are chosen based on analysis of the "
139: + " training data. For this reason, the classifier is not an"
140: + " UpdateableClassifier (which in typical usage are initialized with zero"
141: + " training instances) -- if you need the UpdateableClassifier functionality,"
142: + " use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable"
143: + " classifier will use a default precision of 0.1 for numeric attributes"
144: + " when buildClassifier is called with zero training instances.\n\n"
145: + "For more information on Naive Bayes classifiers, see\n\n"
146: + getTechnicalInformation().toString();
147: }
148:
149: /**
150: * Returns an instance of a TechnicalInformation object, containing
151: * detailed information about the technical background of this class,
152: * e.g., paper reference or book this class is based on.
153: *
154: * @return the technical information about this class
155: */
156: public TechnicalInformation getTechnicalInformation() {
157: TechnicalInformation result;
158:
159: result = new TechnicalInformation(Type.INPROCEEDINGS);
160: result.setValue(Field.AUTHOR, "George H. John and Pat Langley");
161: result
162: .setValue(Field.TITLE,
163: "Estimating Continuous Distributions in Bayesian Classifiers");
164: result
165: .setValue(Field.BOOKTITLE,
166: "Eleventh Conference on Uncertainty in Artificial Intelligence");
167: result.setValue(Field.YEAR, "1995");
168: result.setValue(Field.PAGES, "338-345");
169: result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
170: result.setValue(Field.ADDRESS, "San Mateo");
171:
172: return result;
173: }
174:
175: /**
176: * Returns default capabilities of the classifier.
177: *
178: * @return the capabilities of this classifier
179: */
180: public Capabilities getCapabilities() {
181: Capabilities result = super .getCapabilities();
182:
183: // attributes
184: result.enable(Capability.NOMINAL_ATTRIBUTES);
185: result.enable(Capability.NUMERIC_ATTRIBUTES);
186: result.enable(Capability.MISSING_VALUES);
187:
188: // class
189: result.enable(Capability.NOMINAL_CLASS);
190: result.enable(Capability.MISSING_CLASS_VALUES);
191:
192: // instances
193: result.setMinimumNumberInstances(0);
194:
195: return result;
196: }
197:
198: /**
199: * Generates the classifier.
200: *
201: * @param instances set of instances serving as training data
202: * @exception Exception if the classifier has not been generated
203: * successfully
204: */
205: public void buildClassifier(Instances instances) throws Exception {
206:
207: // can classifier handle the data?
208: getCapabilities().testWithFail(instances);
209:
210: // remove instances with missing class
211: instances = new Instances(instances);
212: instances.deleteWithMissingClass();
213:
214: m_NumClasses = instances.numClasses();
215:
216: // Copy the instances
217: m_Instances = new Instances(instances);
218:
219: // Discretize instances if required
220: if (m_UseDiscretization) {
221: m_Disc = new weka.filters.supervised.attribute.Discretize();
222: m_Disc.setInputFormat(m_Instances);
223: m_Instances = weka.filters.Filter.useFilter(m_Instances,
224: m_Disc);
225: } else {
226: m_Disc = null;
227: }
228:
229: // Reserve space for the distributions
230: m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances
231: .numClasses()];
232: m_ClassDistribution = new DiscreteEstimator(m_Instances
233: .numClasses(), true);
234: int attIndex = 0;
235: Enumeration enu = m_Instances.enumerateAttributes();
236: while (enu.hasMoreElements()) {
237: Attribute attribute = (Attribute) enu.nextElement();
238:
239: // If the attribute is numeric, determine the estimator
240: // numeric precision from differences between adjacent values
241: double numPrecision = DEFAULT_NUM_PRECISION;
242: if (attribute.type() == Attribute.NUMERIC) {
243: m_Instances.sort(attribute);
244: if ((m_Instances.numInstances() > 0)
245: && !m_Instances.instance(0)
246: .isMissing(attribute)) {
247: double lastVal = m_Instances.instance(0).value(
248: attribute);
249: double currentVal, deltaSum = 0;
250: int distinct = 0;
251: for (int i = 1; i < m_Instances.numInstances(); i++) {
252: Instance currentInst = m_Instances.instance(i);
253: if (currentInst.isMissing(attribute)) {
254: break;
255: }
256: currentVal = currentInst.value(attribute);
257: if (currentVal != lastVal) {
258: deltaSum += currentVal - lastVal;
259: lastVal = currentVal;
260: distinct++;
261: }
262: }
263: if (distinct > 0) {
264: numPrecision = deltaSum / distinct;
265: }
266: }
267: }
268:
269: for (int j = 0; j < m_Instances.numClasses(); j++) {
270: switch (attribute.type()) {
271: case Attribute.NUMERIC:
272: if (m_UseKernelEstimator) {
273: m_Distributions[attIndex][j] = new KernelEstimator(
274: numPrecision);
275: } else {
276: m_Distributions[attIndex][j] = new NormalEstimator(
277: numPrecision);
278: }
279: break;
280: case Attribute.NOMINAL:
281: m_Distributions[attIndex][j] = new DiscreteEstimator(
282: attribute.numValues(), true);
283: break;
284: default:
285: throw new Exception(
286: "Attribute type unknown to NaiveBayes");
287: }
288: }
289: attIndex++;
290: }
291:
292: // Compute counts
293: Enumeration enumInsts = m_Instances.enumerateInstances();
294: while (enumInsts.hasMoreElements()) {
295: Instance instance = (Instance) enumInsts.nextElement();
296: updateClassifier(instance);
297: }
298:
299: // Save space
300: m_Instances = new Instances(m_Instances, 0);
301: }
302:
303: /**
304: * Updates the classifier with the given instance.
305: *
306: * @param instance the new training instance to include in the model
307: * @exception Exception if the instance could not be incorporated in
308: * the model.
309: */
310: public void updateClassifier(Instance instance) throws Exception {
311:
312: if (!instance.classIsMissing()) {
313: Enumeration enumAtts = m_Instances.enumerateAttributes();
314: int attIndex = 0;
315: while (enumAtts.hasMoreElements()) {
316: Attribute attribute = (Attribute) enumAtts
317: .nextElement();
318: if (!instance.isMissing(attribute)) {
319: m_Distributions[attIndex][(int) instance
320: .classValue()].addValue(instance
321: .value(attribute), instance.weight());
322: }
323: attIndex++;
324: }
325: m_ClassDistribution.addValue(instance.classValue(),
326: instance.weight());
327: }
328: }
329:
330: /**
331: * Calculates the class membership probabilities for the given test
332: * instance.
333: *
334: * @param instance the instance to be classified
335: * @return predicted class probability distribution
336: * @exception Exception if there is a problem generating the prediction
337: */
338: public double[] distributionForInstance(Instance instance)
339: throws Exception {
340:
341: if (m_UseDiscretization) {
342: m_Disc.input(instance);
343: instance = m_Disc.output();
344: }
345: double[] probs = new double[m_NumClasses];
346: for (int j = 0; j < m_NumClasses; j++) {
347: probs[j] = m_ClassDistribution.getProbability(j);
348: }
349: Enumeration enumAtts = instance.enumerateAttributes();
350: int attIndex = 0;
351: while (enumAtts.hasMoreElements()) {
352: Attribute attribute = (Attribute) enumAtts.nextElement();
353: if (!instance.isMissing(attribute)) {
354: double temp, max = 0;
355: for (int j = 0; j < m_NumClasses; j++) {
356: temp = Math.max(1e-75, m_Distributions[attIndex][j]
357: .getProbability(instance.value(attribute)));
358: probs[j] *= temp;
359: if (probs[j] > max) {
360: max = probs[j];
361: }
362: if (Double.isNaN(probs[j])) {
363: throw new Exception(
364: "NaN returned from estimator for attribute "
365: + attribute.name()
366: + ":\n"
367: + m_Distributions[attIndex][j]
368: .toString());
369: }
370: }
371: if ((max > 0) && (max < 1e-75)) { // Danger of probability underflow
372: for (int j = 0; j < m_NumClasses; j++) {
373: probs[j] *= 1e75;
374: }
375: }
376: }
377: attIndex++;
378: }
379:
380: // Display probabilities
381: Utils.normalize(probs);
382: return probs;
383: }
384:
385: /**
386: * Returns an enumeration describing the available options.
387: *
388: * @return an enumeration of all the available options.
389: */
390: public Enumeration listOptions() {
391:
392: Vector newVector = new Vector(2);
393:
394: newVector.addElement(new Option(
395: "\tUse kernel density estimator rather than normal\n"
396: + "\tdistribution for numeric attributes", "K",
397: 0, "-K"));
398: newVector
399: .addElement(new Option(
400: "\tUse supervised discretization to process numeric attributes\n",
401: "D", 0, "-D"));
402: return newVector.elements();
403: }
404:
405: /**
406: * Parses a given list of options. <p/>
407: *
408: <!-- options-start -->
409: * Valid options are: <p/>
410: *
411: * <pre> -K
412: * Use kernel density estimator rather than normal
413: * distribution for numeric attributes</pre>
414: *
415: * <pre> -D
416: * Use supervised discretization to process numeric attributes
417: * </pre>
418: *
419: <!-- options-end -->
420: *
421: * @param options the list of options as an array of strings
422: * @exception Exception if an option is not supported
423: */
424: public void setOptions(String[] options) throws Exception {
425:
426: boolean k = Utils.getFlag('K', options);
427: boolean d = Utils.getFlag('D', options);
428: if (k && d) {
429: throw new IllegalArgumentException(
430: "Can't use both kernel density "
431: + "estimation and discretization!");
432: }
433: setUseSupervisedDiscretization(d);
434: setUseKernelEstimator(k);
435: Utils.checkForRemainingOptions(options);
436: }
437:
438: /**
439: * Gets the current settings of the classifier.
440: *
441: * @return an array of strings suitable for passing to setOptions
442: */
443: public String[] getOptions() {
444:
445: String[] options = new String[2];
446: int current = 0;
447:
448: if (m_UseKernelEstimator) {
449: options[current++] = "-K";
450: }
451:
452: if (m_UseDiscretization) {
453: options[current++] = "-D";
454: }
455:
456: while (current < options.length) {
457: options[current++] = "";
458: }
459: return options;
460: }
461:
462: /**
463: * Returns a description of the classifier.
464: *
465: * @return a description of the classifier as a string.
466: */
467: public String toString() {
468:
469: StringBuffer text = new StringBuffer();
470:
471: text.append("Naive Bayes Classifier");
472: if (m_Instances == null) {
473: text.append(": No model built yet.");
474: } else {
475: try {
476: for (int i = 0; i < m_Distributions[0].length; i++) {
477: text.append("\n\nClass "
478: + m_Instances.classAttribute().value(i)
479: + ": Prior probability = "
480: + Utils.doubleToString(m_ClassDistribution
481: .getProbability(i), 4, 2) + "\n\n");
482: Enumeration enumAtts = m_Instances
483: .enumerateAttributes();
484: int attIndex = 0;
485: while (enumAtts.hasMoreElements()) {
486: Attribute attribute = (Attribute) enumAtts
487: .nextElement();
488: text.append(attribute.name() + ": "
489: + m_Distributions[attIndex][i]);
490: attIndex++;
491: }
492: }
493: } catch (Exception ex) {
494: text.append(ex.getMessage());
495: }
496: }
497:
498: return text.toString();
499: }
500:
501: /**
502: * Returns the tip text for this property
503: * @return tip text for this property suitable for
504: * displaying in the explorer/experimenter gui
505: */
506: public String useKernelEstimatorTipText() {
507: return "Use a kernel estimator for numeric attributes rather than a "
508: + "normal distribution.";
509: }
510:
511: /**
512: * Gets if kernel estimator is being used.
513: *
514: * @return Value of m_UseKernelEstimatory.
515: */
516: public boolean getUseKernelEstimator() {
517:
518: return m_UseKernelEstimator;
519: }
520:
521: /**
522: * Sets if kernel estimator is to be used.
523: *
524: * @param v Value to assign to m_UseKernelEstimatory.
525: */
526: public void setUseKernelEstimator(boolean v) {
527:
528: m_UseKernelEstimator = v;
529: if (v) {
530: setUseSupervisedDiscretization(false);
531: }
532: }
533:
534: /**
535: * Returns the tip text for this property
536: * @return tip text for this property suitable for
537: * displaying in the explorer/experimenter gui
538: */
539: public String useSupervisedDiscretizationTipText() {
540: return "Use supervised discretization to convert numeric attributes to nominal "
541: + "ones.";
542: }
543:
544: /**
545: * Get whether supervised discretization is to be used.
546: *
547: * @return true if supervised discretization is to be used.
548: */
549: public boolean getUseSupervisedDiscretization() {
550:
551: return m_UseDiscretization;
552: }
553:
554: /**
555: * Set whether supervised discretization is to be used.
556: *
557: * @param newblah true if supervised discretization is to be used.
558: */
559: public void setUseSupervisedDiscretization(boolean newblah) {
560:
561: m_UseDiscretization = newblah;
562: if (newblah) {
563: setUseKernelEstimator(false);
564: }
565: }
566:
567: /**
568: * Main method for testing this class.
569: *
570: * @param argv the options
571: */
572: public static void main(String[] argv) {
573: runClassifier(new NaiveBayes(), argv);
574: }
575: }
|