0001: /*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * Discretize.java
0019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.filters.supervised.attribute;
0024:
0025: import weka.core.Attribute;
0026: import weka.core.Capabilities;
0027: import weka.core.ContingencyTables;
0028: import weka.core.FastVector;
0029: import weka.core.Instance;
0030: import weka.core.Instances;
0031: import weka.core.Option;
0032: import weka.core.OptionHandler;
0033: import weka.core.Range;
0034: import weka.core.SparseInstance;
0035: import weka.core.SpecialFunctions;
0036: import weka.core.TechnicalInformation;
0037: import weka.core.TechnicalInformationHandler;
0038: import weka.core.Utils;
0039: import weka.core.WeightedInstancesHandler;
0040: import weka.core.Capabilities.Capability;
0041: import weka.core.TechnicalInformation.Field;
0042: import weka.core.TechnicalInformation.Type;
0043: import weka.filters.Filter;
0044: import weka.filters.SupervisedFilter;
0045:
0046: import java.util.Enumeration;
0047: import java.util.Vector;
0048:
0049: /**
0050: <!-- globalinfo-start -->
0051: * An instance filter that discretizes a range of numeric attributes in the dataset into nominal attributes. Discretization is by Fayyad & Irani's MDL method (the default).<br/>
0052: * <br/>
0053: * For more information, see:<br/>
0054: * <br/>
0055: * Usama M. Fayyad, Keki B. Irani: Multi-interval discretization of continuousvalued attributes for classification learning. In: Thirteenth International Joint Conference on Articial Intelligence, 1022-1027, 1993.<br/>
0056: * <br/>
0057: * Igor Kononenko: On Biases in Estimating Multi-Valued Attributes. In: 14th International Joint Conference on Articial Intelligence, 1034-1040, 1995.
0058: * <p/>
0059: <!-- globalinfo-end -->
0060: *
0061: <!-- technical-bibtex-start -->
0062: * BibTeX:
0063: * <pre>
0064: * @inproceedings{Fayyad1993,
0065: * author = {Usama M. Fayyad and Keki B. Irani},
0066: * booktitle = {Thirteenth International Joint Conference on Articial Intelligence},
0067: * pages = {1022-1027},
0068: * publisher = {Morgan Kaufmann Publishers},
0069: * title = {Multi-interval discretization of continuousvalued attributes for classification learning},
0070: * volume = {2},
0071: * year = {1993}
0072: * }
0073: *
0074: * @inproceedings{Kononenko1995,
0075: * author = {Igor Kononenko},
0076: * booktitle = {14th International Joint Conference on Articial Intelligence},
0077: * pages = {1034-1040},
0078: * title = {On Biases in Estimating Multi-Valued Attributes},
0079: * year = {1995},
0080: * PS = {http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz}
0081: * }
0082: * </pre>
0083: * <p/>
0084: <!-- technical-bibtex-end -->
0085: *
0086: <!-- options-start -->
0087: * Valid options are: <p/>
0088: *
0089: * <pre> -R <col1,col2-col4,...>
0090: * Specifies list of columns to Discretize. First and last are valid indexes.
0091: * (default none)</pre>
0092: *
0093: * <pre> -V
0094: * Invert matching sense of column indexes.</pre>
0095: *
0096: * <pre> -D
0097: * Output binary attributes for discretized attributes.</pre>
0098: *
0099: * <pre> -E
0100: * Use better encoding of split point for MDL.</pre>
0101: *
0102: * <pre> -K
0103: * Use Kononenko's MDL criterion.</pre>
0104: *
0105: <!-- options-end -->
0106: *
0107: * @author Len Trigg (trigg@cs.waikato.ac.nz)
0108: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
0109: * @version $Revision: 1.7 $
0110: */
0111: public class Discretize extends Filter implements SupervisedFilter,
0112: OptionHandler, WeightedInstancesHandler,
0113: TechnicalInformationHandler {
0114:
0115: /** for serialization */
0116: static final long serialVersionUID = -3141006402280129097L;
0117:
0118: /** Stores which columns to Discretize */
0119: protected Range m_DiscretizeCols = new Range();
0120:
0121: /** Store the current cutpoints */
0122: protected double[][] m_CutPoints = null;
0123:
0124: /** Output binary attributes for discretized attributes. */
0125: protected boolean m_MakeBinary = false;
0126:
0127: /** Use better encoding of split point for MDL. */
0128: protected boolean m_UseBetterEncoding = false;
0129:
0130: /** Use Kononenko's MDL criterion instead of Fayyad et al.'s */
0131: protected boolean m_UseKononenko = false;
0132:
0133: /** Constructor - initialises the filter */
0134: public Discretize() {
0135:
0136: setAttributeIndices("first-last");
0137: }
0138:
0139: /**
0140: * Gets an enumeration describing the available options.
0141: *
0142: * @return an enumeration of all the available options.
0143: */
0144: public Enumeration listOptions() {
0145:
0146: Vector newVector = new Vector(7);
0147:
0148: newVector.addElement(new Option(
0149: "\tSpecifies list of columns to Discretize. First"
0150: + " and last are valid indexes.\n"
0151: + "\t(default none)", "R", 1,
0152: "-R <col1,col2-col4,...>"));
0153:
0154: newVector.addElement(new Option(
0155: "\tInvert matching sense of column indexes.", "V", 0,
0156: "-V"));
0157:
0158: newVector
0159: .addElement(new Option(
0160: "\tOutput binary attributes for discretized attributes.",
0161: "D", 0, "-D"));
0162:
0163: newVector.addElement(new Option(
0164: "\tUse better encoding of split point for MDL.", "E",
0165: 0, "-E"));
0166:
0167: newVector.addElement(new Option(
0168: "\tUse Kononenko's MDL criterion.", "K", 0, "-K"));
0169:
0170: return newVector.elements();
0171: }
0172:
0173: /**
0174: * Parses a given list of options. <p/>
0175: *
0176: <!-- options-start -->
0177: * Valid options are: <p/>
0178: *
0179: * <pre> -R <col1,col2-col4,...>
0180: * Specifies list of columns to Discretize. First and last are valid indexes.
0181: * (default none)</pre>
0182: *
0183: * <pre> -V
0184: * Invert matching sense of column indexes.</pre>
0185: *
0186: * <pre> -D
0187: * Output binary attributes for discretized attributes.</pre>
0188: *
0189: * <pre> -E
0190: * Use better encoding of split point for MDL.</pre>
0191: *
0192: * <pre> -K
0193: * Use Kononenko's MDL criterion.</pre>
0194: *
0195: <!-- options-end -->
0196: *
0197: * @param options the list of options as an array of strings
0198: * @throws Exception if an option is not supported
0199: */
0200: public void setOptions(String[] options) throws Exception {
0201:
0202: setMakeBinary(Utils.getFlag('D', options));
0203: setUseBetterEncoding(Utils.getFlag('E', options));
0204: setUseKononenko(Utils.getFlag('K', options));
0205: setInvertSelection(Utils.getFlag('V', options));
0206:
0207: String convertList = Utils.getOption('R', options);
0208: if (convertList.length() != 0) {
0209: setAttributeIndices(convertList);
0210: } else {
0211: setAttributeIndices("first-last");
0212: }
0213:
0214: if (getInputFormat() != null) {
0215: setInputFormat(getInputFormat());
0216: }
0217: }
0218:
0219: /**
0220: * Gets the current settings of the filter.
0221: *
0222: * @return an array of strings suitable for passing to setOptions
0223: */
0224: public String[] getOptions() {
0225:
0226: String[] options = new String[12];
0227: int current = 0;
0228:
0229: if (getMakeBinary()) {
0230: options[current++] = "-D";
0231: }
0232: if (getUseBetterEncoding()) {
0233: options[current++] = "-E";
0234: }
0235: if (getUseKononenko()) {
0236: options[current++] = "-K";
0237: }
0238: if (getInvertSelection()) {
0239: options[current++] = "-V";
0240: }
0241: if (!getAttributeIndices().equals("")) {
0242: options[current++] = "-R";
0243: options[current++] = getAttributeIndices();
0244: }
0245: while (current < options.length) {
0246: options[current++] = "";
0247: }
0248: return options;
0249: }
0250:
0251: /**
0252: * Returns the Capabilities of this filter.
0253: *
0254: * @return the capabilities of this object
0255: * @see Capabilities
0256: */
0257: public Capabilities getCapabilities() {
0258: Capabilities result = super .getCapabilities();
0259:
0260: // attributes
0261: result.enableAllAttributes();
0262: result.enable(Capability.MISSING_VALUES);
0263:
0264: // class
0265: result.enable(Capability.NOMINAL_CLASS);
0266:
0267: return result;
0268: }
0269:
0270: /**
0271: * Sets the format of the input instances.
0272: *
0273: * @param instanceInfo an Instances object containing the input instance
0274: * structure (any instances contained in the object are ignored - only the
0275: * structure is required).
0276: * @return true if the outputFormat may be collected immediately
0277: * @throws Exception if the input format can't be set successfully
0278: */
0279: public boolean setInputFormat(Instances instanceInfo)
0280: throws Exception {
0281:
0282: super .setInputFormat(instanceInfo);
0283:
0284: m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
0285: m_CutPoints = null;
0286:
0287: // If we implement loading cutfiles, then load
0288: //them here and set the output format
0289: return false;
0290: }
0291:
0292: /**
0293: * Input an instance for filtering. Ordinarily the instance is processed
0294: * and made available for output immediately. Some filters require all
0295: * instances be read before producing output.
0296: *
0297: * @param instance the input instance
0298: * @return true if the filtered instance may now be
0299: * collected with output().
0300: * @throws IllegalStateException if no input format has been defined.
0301: */
0302: public boolean input(Instance instance) {
0303:
0304: if (getInputFormat() == null) {
0305: throw new IllegalStateException(
0306: "No input instance format defined");
0307: }
0308: if (m_NewBatch) {
0309: resetQueue();
0310: m_NewBatch = false;
0311: }
0312:
0313: if (m_CutPoints != null) {
0314: convertInstance(instance);
0315: return true;
0316: }
0317:
0318: bufferInput(instance);
0319: return false;
0320: }
0321:
0322: /**
0323: * Signifies that this batch of input to the filter is finished. If the
0324: * filter requires all instances prior to filtering, output() may now
0325: * be called to retrieve the filtered instances.
0326: *
0327: * @return true if there are instances pending output
0328: * @throws IllegalStateException if no input structure has been defined
0329: */
0330: public boolean batchFinished() {
0331:
0332: if (getInputFormat() == null) {
0333: throw new IllegalStateException(
0334: "No input instance format defined");
0335: }
0336: if (m_CutPoints == null) {
0337: calculateCutPoints();
0338:
0339: setOutputFormat();
0340:
0341: // If we implement saving cutfiles, save the cuts here
0342:
0343: // Convert pending input instances
0344: for (int i = 0; i < getInputFormat().numInstances(); i++) {
0345: convertInstance(getInputFormat().instance(i));
0346: }
0347: }
0348: flushInput();
0349:
0350: m_NewBatch = true;
0351: return (numPendingOutput() != 0);
0352: }
0353:
0354: /**
0355: * Returns a string describing this filter
0356: *
0357: * @return a description of the filter suitable for
0358: * displaying in the explorer/experimenter gui
0359: */
0360: public String globalInfo() {
0361:
0362: return "An instance filter that discretizes a range of numeric"
0363: + " attributes in the dataset into nominal attributes."
0364: + " Discretization is by Fayyad & Irani's MDL method (the default).\n\n"
0365: + "For more information, see:\n\n"
0366: + getTechnicalInformation().toString();
0367: }
0368:
0369: /**
0370: * Returns an instance of a TechnicalInformation object, containing
0371: * detailed information about the technical background of this class,
0372: * e.g., paper reference or book this class is based on.
0373: *
0374: * @return the technical information about this class
0375: */
0376: public TechnicalInformation getTechnicalInformation() {
0377: TechnicalInformation result;
0378: TechnicalInformation additional;
0379:
0380: result = new TechnicalInformation(Type.INPROCEEDINGS);
0381: result.setValue(Field.AUTHOR,
0382: "Usama M. Fayyad and Keki B. Irani");
0383: result
0384: .setValue(
0385: Field.TITLE,
0386: "Multi-interval discretization of continuousvalued attributes for classification learning");
0387: result
0388: .setValue(Field.BOOKTITLE,
0389: "Thirteenth International Joint Conference on Articial Intelligence");
0390: result.setValue(Field.YEAR, "1993");
0391: result.setValue(Field.VOLUME, "2");
0392: result.setValue(Field.PAGES, "1022-1027");
0393: result.setValue(Field.PUBLISHER, "Morgan Kaufmann Publishers");
0394:
0395: additional = result.add(Type.INPROCEEDINGS);
0396: additional.setValue(Field.AUTHOR, "Igor Kononenko");
0397: additional.setValue(Field.TITLE,
0398: "On Biases in Estimating Multi-Valued Attributes");
0399: additional
0400: .setValue(Field.BOOKTITLE,
0401: "14th International Joint Conference on Articial Intelligence");
0402: additional.setValue(Field.YEAR, "1995");
0403: additional.setValue(Field.PAGES, "1034-1040");
0404: additional
0405: .setValue(Field.PS,
0406: "http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz");
0407:
0408: return result;
0409: }
0410:
0411: /**
0412: * Returns the tip text for this property
0413: *
0414: * @return tip text for this property suitable for
0415: * displaying in the explorer/experimenter gui
0416: */
0417: public String makeBinaryTipText() {
0418:
0419: return "Make resulting attributes binary.";
0420: }
0421:
0422: /**
0423: * Gets whether binary attributes should be made for discretized ones.
0424: *
0425: * @return true if attributes will be binarized
0426: */
0427: public boolean getMakeBinary() {
0428:
0429: return m_MakeBinary;
0430: }
0431:
0432: /**
0433: * Sets whether binary attributes should be made for discretized ones.
0434: *
0435: * @param makeBinary if binary attributes are to be made
0436: */
0437: public void setMakeBinary(boolean makeBinary) {
0438:
0439: m_MakeBinary = makeBinary;
0440: }
0441:
0442: /**
0443: * Returns the tip text for this property
0444: *
0445: * @return tip text for this property suitable for
0446: * displaying in the explorer/experimenter gui
0447: */
0448: public String useKononenkoTipText() {
0449:
0450: return "Use Kononenko's MDL criterion. If set to false"
0451: + " uses the Fayyad & Irani criterion.";
0452: }
0453:
0454: /**
0455: * Gets whether Kononenko's MDL criterion is to be used.
0456: *
0457: * @return true if Kononenko's criterion will be used.
0458: */
0459: public boolean getUseKononenko() {
0460:
0461: return m_UseKononenko;
0462: }
0463:
0464: /**
0465: * Sets whether Kononenko's MDL criterion is to be used.
0466: *
0467: * @param useKon true if Kononenko's one is to be used
0468: */
0469: public void setUseKononenko(boolean useKon) {
0470:
0471: m_UseKononenko = useKon;
0472: }
0473:
0474: /**
0475: * Returns the tip text for this property
0476: *
0477: * @return tip text for this property suitable for
0478: * displaying in the explorer/experimenter gui
0479: */
0480: public String useBetterEncodingTipText() {
0481:
0482: return "Uses a more efficient split point encoding.";
0483: }
0484:
0485: /**
0486: * Gets whether better encoding is to be used for MDL.
0487: *
0488: * @return true if the better MDL encoding will be used
0489: */
0490: public boolean getUseBetterEncoding() {
0491:
0492: return m_UseBetterEncoding;
0493: }
0494:
0495: /**
0496: * Sets whether better encoding is to be used for MDL.
0497: *
0498: * @param useBetterEncoding true if better encoding to be used.
0499: */
0500: public void setUseBetterEncoding(boolean useBetterEncoding) {
0501:
0502: m_UseBetterEncoding = useBetterEncoding;
0503: }
0504:
0505: /**
0506: * Returns the tip text for this property
0507: *
0508: * @return tip text for this property suitable for
0509: * displaying in the explorer/experimenter gui
0510: */
0511: public String invertSelectionTipText() {
0512:
0513: return "Set attribute selection mode. If false, only selected"
0514: + " (numeric) attributes in the range will be discretized; if"
0515: + " true, only non-selected attributes will be discretized.";
0516: }
0517:
0518: /**
0519: * Gets whether the supplied columns are to be removed or kept
0520: *
0521: * @return true if the supplied columns will be kept
0522: */
0523: public boolean getInvertSelection() {
0524:
0525: return m_DiscretizeCols.getInvert();
0526: }
0527:
0528: /**
0529: * Sets whether selected columns should be removed or kept. If true the
0530: * selected columns are kept and unselected columns are deleted. If false
0531: * selected columns are deleted and unselected columns are kept.
0532: *
0533: * @param invert the new invert setting
0534: */
0535: public void setInvertSelection(boolean invert) {
0536:
0537: m_DiscretizeCols.setInvert(invert);
0538: }
0539:
0540: /**
0541: * Returns the tip text for this property
0542: *
0543: * @return tip text for this property suitable for
0544: * displaying in the explorer/experimenter gui
0545: */
0546: public String attributeIndicesTipText() {
0547: return "Specify range of attributes to act on."
0548: + " This is a comma separated list of attribute indices, with"
0549: + " \"first\" and \"last\" valid values. Specify an inclusive"
0550: + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
0551: }
0552:
0553: /**
0554: * Gets the current range selection
0555: *
0556: * @return a string containing a comma separated list of ranges
0557: */
0558: public String getAttributeIndices() {
0559:
0560: return m_DiscretizeCols.getRanges();
0561: }
0562:
0563: /**
0564: * Sets which attributes are to be Discretized (only numeric
0565: * attributes among the selection will be Discretized).
0566: *
0567: * @param rangeList a string representing the list of attributes. Since
0568: * the string will typically come from a user, attributes are indexed from
0569: * 1. <br>
0570: * eg: first-3,5,6-last
0571: * @throws IllegalArgumentException if an invalid range list is supplied
0572: */
0573: public void setAttributeIndices(String rangeList) {
0574:
0575: m_DiscretizeCols.setRanges(rangeList);
0576: }
0577:
0578: /**
0579: * Sets which attributes are to be Discretized (only numeric
0580: * attributes among the selection will be Discretized).
0581: *
0582: * @param attributes an array containing indexes of attributes to Discretize.
0583: * Since the array will typically come from a program, attributes are indexed
0584: * from 0.
0585: * @throws IllegalArgumentException if an invalid set of ranges
0586: * is supplied
0587: */
0588: public void setAttributeIndicesArray(int[] attributes) {
0589:
0590: setAttributeIndices(Range.indicesToRangeList(attributes));
0591: }
0592:
0593: /**
0594: * Gets the cut points for an attribute
0595: *
0596: * @param attributeIndex the index (from 0) of the attribute to get the cut points of
0597: * @return an array containing the cutpoints (or null if the
0598: * attribute requested isn't being Discretized
0599: */
0600: public double[] getCutPoints(int attributeIndex) {
0601:
0602: if (m_CutPoints == null) {
0603: return null;
0604: }
0605: return m_CutPoints[attributeIndex];
0606: }
0607:
0608: /** Generate the cutpoints for each attribute */
0609: protected void calculateCutPoints() {
0610:
0611: Instances copy = null;
0612:
0613: m_CutPoints = new double[getInputFormat().numAttributes()][];
0614: for (int i = getInputFormat().numAttributes() - 1; i >= 0; i--) {
0615: if ((m_DiscretizeCols.isInRange(i))
0616: && (getInputFormat().attribute(i).isNumeric())) {
0617:
0618: // Use copy to preserve order
0619: if (copy == null) {
0620: copy = new Instances(getInputFormat());
0621: }
0622: calculateCutPointsByMDL(i, copy);
0623: }
0624: }
0625: }
0626:
0627: /**
0628: * Set cutpoints for a single attribute using MDL.
0629: *
0630: * @param index the index of the attribute to set cutpoints for
0631: * @param data the data to work with
0632: */
0633: protected void calculateCutPointsByMDL(int index, Instances data) {
0634:
0635: // Sort instances
0636: data.sort(data.attribute(index));
0637:
0638: // Find first instances that's missing
0639: int firstMissing = data.numInstances();
0640: for (int i = 0; i < data.numInstances(); i++) {
0641: if (data.instance(i).isMissing(index)) {
0642: firstMissing = i;
0643: break;
0644: }
0645: }
0646: m_CutPoints[index] = cutPointsForSubset(data, index, 0,
0647: firstMissing);
0648: }
0649:
0650: /**
0651: * Test using Kononenko's MDL criterion.
0652: *
0653: * @param priorCounts
0654: * @param bestCounts
0655: * @param numInstances
0656: * @param numCutPoints
0657: * @return true if the split is acceptable
0658: */
0659: private boolean KononenkosMDL(double[] priorCounts,
0660: double[][] bestCounts, double numInstances, int numCutPoints) {
0661:
0662: double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
0663: double before, after;
0664: int numClassesTotal;
0665:
0666: // Number of classes occuring in the set
0667: numClassesTotal = 0;
0668: for (int i = 0; i < priorCounts.length; i++) {
0669: if (priorCounts[i] > 0) {
0670: numClassesTotal++;
0671: }
0672: }
0673:
0674: // Encode distribution prior to split
0675: distPrior = SpecialFunctions.log2Binomial(numInstances
0676: + numClassesTotal - 1, numClassesTotal - 1);
0677:
0678: // Encode instances prior to split.
0679: instPrior = SpecialFunctions.log2Multinomial(numInstances,
0680: priorCounts);
0681:
0682: before = instPrior + distPrior;
0683:
0684: // Encode distributions and instances after split.
0685: for (int i = 0; i < bestCounts.length; i++) {
0686: sum = Utils.sum(bestCounts[i]);
0687: distAfter += SpecialFunctions.log2Binomial(sum
0688: + numClassesTotal - 1, numClassesTotal - 1);
0689: instAfter += SpecialFunctions.log2Multinomial(sum,
0690: bestCounts[i]);
0691: }
0692:
0693: // Coding cost after split
0694: after = Utils.log2(numCutPoints) + distAfter + instAfter;
0695:
0696: // Check if split is to be accepted
0697: return (before > after);
0698: }
0699:
0700: /**
0701: * Test using Fayyad and Irani's MDL criterion.
0702: *
0703: * @param priorCounts
0704: * @param bestCounts
0705: * @param numInstances
0706: * @param numCutPoints
0707: * @return true if the splits is acceptable
0708: */
0709: private boolean FayyadAndIranisMDL(double[] priorCounts,
0710: double[][] bestCounts, double numInstances, int numCutPoints) {
0711:
0712: double priorEntropy, entropy, gain;
0713: double entropyLeft, entropyRight, delta;
0714: int numClassesTotal, numClassesRight, numClassesLeft;
0715:
0716: // Compute entropy before split.
0717: priorEntropy = ContingencyTables.entropy(priorCounts);
0718:
0719: // Compute entropy after split.
0720: entropy = ContingencyTables
0721: .entropyConditionedOnRows(bestCounts);
0722:
0723: // Compute information gain.
0724: gain = priorEntropy - entropy;
0725:
0726: // Number of classes occuring in the set
0727: numClassesTotal = 0;
0728: for (int i = 0; i < priorCounts.length; i++) {
0729: if (priorCounts[i] > 0) {
0730: numClassesTotal++;
0731: }
0732: }
0733:
0734: // Number of classes occuring in the left subset
0735: numClassesLeft = 0;
0736: for (int i = 0; i < bestCounts[0].length; i++) {
0737: if (bestCounts[0][i] > 0) {
0738: numClassesLeft++;
0739: }
0740: }
0741:
0742: // Number of classes occuring in the right subset
0743: numClassesRight = 0;
0744: for (int i = 0; i < bestCounts[1].length; i++) {
0745: if (bestCounts[1][i] > 0) {
0746: numClassesRight++;
0747: }
0748: }
0749:
0750: // Entropy of the left and the right subsets
0751: entropyLeft = ContingencyTables.entropy(bestCounts[0]);
0752: entropyRight = ContingencyTables.entropy(bestCounts[1]);
0753:
0754: // Compute terms for MDL formula
0755: delta = Utils.log2(Math.pow(3, numClassesTotal) - 2)
0756: - (((double) numClassesTotal * priorEntropy)
0757: - (numClassesRight * entropyRight) - (numClassesLeft * entropyLeft));
0758:
0759: // Check if split is to be accepted
0760: return (gain > (Utils.log2(numCutPoints) + delta)
0761: / (double) numInstances);
0762: }
0763:
0764: /**
0765: * Selects cutpoints for sorted subset.
0766: *
0767: * @param instances
0768: * @param attIndex
0769: * @param first
0770: * @param lastPlusOne
0771: * @return
0772: */
0773: private double[] cutPointsForSubset(Instances instances,
0774: int attIndex, int first, int lastPlusOne) {
0775:
0776: double[][] counts, bestCounts;
0777: double[] priorCounts, left, right, cutPoints;
0778: double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1, currentEntropy, bestEntropy, priorEntropy, gain;
0779: int bestIndex = -1, numInstances = 0, numCutPoints = 0;
0780:
0781: // Compute number of instances in set
0782: if ((lastPlusOne - first) < 2) {
0783: return null;
0784: }
0785:
0786: // Compute class counts.
0787: counts = new double[2][instances.numClasses()];
0788: for (int i = first; i < lastPlusOne; i++) {
0789: numInstances += instances.instance(i).weight();
0790: counts[1][(int) instances.instance(i).classValue()] += instances
0791: .instance(i).weight();
0792: }
0793:
0794: // Save prior counts
0795: priorCounts = new double[instances.numClasses()];
0796: System.arraycopy(counts[1], 0, priorCounts, 0, instances
0797: .numClasses());
0798:
0799: // Entropy of the full set
0800: priorEntropy = ContingencyTables.entropy(priorCounts);
0801: bestEntropy = priorEntropy;
0802:
0803: // Find best entropy.
0804: bestCounts = new double[2][instances.numClasses()];
0805: for (int i = first; i < (lastPlusOne - 1); i++) {
0806: counts[0][(int) instances.instance(i).classValue()] += instances
0807: .instance(i).weight();
0808: counts[1][(int) instances.instance(i).classValue()] -= instances
0809: .instance(i).weight();
0810: if (instances.instance(i).value(attIndex) < instances
0811: .instance(i + 1).value(attIndex)) {
0812: currentCutPoint = (instances.instance(i)
0813: .value(attIndex) + instances.instance(i + 1)
0814: .value(attIndex)) / 2.0;
0815: currentEntropy = ContingencyTables
0816: .entropyConditionedOnRows(counts);
0817: if (currentEntropy < bestEntropy) {
0818: bestCutPoint = currentCutPoint;
0819: bestEntropy = currentEntropy;
0820: bestIndex = i;
0821: System.arraycopy(counts[0], 0, bestCounts[0], 0,
0822: instances.numClasses());
0823: System.arraycopy(counts[1], 0, bestCounts[1], 0,
0824: instances.numClasses());
0825: }
0826: numCutPoints++;
0827: }
0828: }
0829:
0830: // Use worse encoding?
0831: if (!m_UseBetterEncoding) {
0832: numCutPoints = (lastPlusOne - first) - 1;
0833: }
0834:
0835: // Checks if gain is zero
0836: gain = priorEntropy - bestEntropy;
0837: if (gain <= 0) {
0838: return null;
0839: }
0840:
0841: // Check if split is to be accepted
0842: if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts,
0843: numInstances, numCutPoints))
0844: || (!m_UseKononenko && FayyadAndIranisMDL(priorCounts,
0845: bestCounts, numInstances, numCutPoints))) {
0846:
0847: // Select split points for the left and right subsets
0848: left = cutPointsForSubset(instances, attIndex, first,
0849: bestIndex + 1);
0850: right = cutPointsForSubset(instances, attIndex,
0851: bestIndex + 1, lastPlusOne);
0852:
0853: // Merge cutpoints and return them
0854: if ((left == null) && (right) == null) {
0855: cutPoints = new double[1];
0856: cutPoints[0] = bestCutPoint;
0857: } else if (right == null) {
0858: cutPoints = new double[left.length + 1];
0859: System.arraycopy(left, 0, cutPoints, 0, left.length);
0860: cutPoints[left.length] = bestCutPoint;
0861: } else if (left == null) {
0862: cutPoints = new double[1 + right.length];
0863: cutPoints[0] = bestCutPoint;
0864: System.arraycopy(right, 0, cutPoints, 1, right.length);
0865: } else {
0866: cutPoints = new double[left.length + right.length + 1];
0867: System.arraycopy(left, 0, cutPoints, 0, left.length);
0868: cutPoints[left.length] = bestCutPoint;
0869: System.arraycopy(right, 0, cutPoints, left.length + 1,
0870: right.length);
0871: }
0872:
0873: return cutPoints;
0874: } else
0875: return null;
0876: }
0877:
0878: /**
0879: * Set the output format. Takes the currently defined cutpoints and
0880: * m_InputFormat and calls setOutputFormat(Instances) appropriately.
0881: */
0882: protected void setOutputFormat() {
0883:
0884: if (m_CutPoints == null) {
0885: setOutputFormat(null);
0886: return;
0887: }
0888: FastVector attributes = new FastVector(getInputFormat()
0889: .numAttributes());
0890: int classIndex = getInputFormat().classIndex();
0891: for (int i = 0; i < getInputFormat().numAttributes(); i++) {
0892: if ((m_DiscretizeCols.isInRange(i))
0893: && (getInputFormat().attribute(i).isNumeric())) {
0894: if (!m_MakeBinary) {
0895: FastVector attribValues = new FastVector(1);
0896: if (m_CutPoints[i] == null) {
0897: attribValues.addElement("'All'");
0898: } else {
0899: for (int j = 0; j <= m_CutPoints[i].length; j++) {
0900: if (j == 0) {
0901: attribValues.addElement("'(-inf-"
0902: + Utils.doubleToString(
0903: m_CutPoints[i][j], 6)
0904: + "]'");
0905: } else if (j == m_CutPoints[i].length) {
0906: attribValues.addElement("'("
0907: + Utils.doubleToString(
0908: m_CutPoints[i][j - 1],
0909: 6) + "-inf)'");
0910: } else {
0911: attribValues.addElement("'("
0912: + Utils.doubleToString(
0913: m_CutPoints[i][j - 1],
0914: 6)
0915: + "-"
0916: + Utils.doubleToString(
0917: m_CutPoints[i][j], 6)
0918: + "]'");
0919: }
0920: }
0921: }
0922: attributes.addElement(new Attribute(
0923: getInputFormat().attribute(i).name(),
0924: attribValues));
0925: } else {
0926: if (m_CutPoints[i] == null) {
0927: FastVector attribValues = new FastVector(1);
0928: attribValues.addElement("'All'");
0929: attributes.addElement(new Attribute(
0930: getInputFormat().attribute(i).name(),
0931: attribValues));
0932: } else {
0933: if (i < getInputFormat().classIndex()) {
0934: classIndex += m_CutPoints[i].length - 1;
0935: }
0936: for (int j = 0; j < m_CutPoints[i].length; j++) {
0937: FastVector attribValues = new FastVector(2);
0938: attribValues.addElement("'(-inf-"
0939: + Utils.doubleToString(
0940: m_CutPoints[i][j], 6)
0941: + "]'");
0942: attribValues.addElement("'("
0943: + Utils.doubleToString(
0944: m_CutPoints[i][j], 6)
0945: + "-inf)'");
0946: attributes.addElement(new Attribute(
0947: getInputFormat().attribute(i)
0948: .name(), attribValues));
0949: }
0950: }
0951: }
0952: } else {
0953: attributes.addElement(getInputFormat().attribute(i)
0954: .copy());
0955: }
0956: }
0957: Instances outputFormat = new Instances(getInputFormat()
0958: .relationName(), attributes, 0);
0959: outputFormat.setClassIndex(classIndex);
0960: setOutputFormat(outputFormat);
0961: }
0962:
0963: /**
0964: * Convert a single instance over. The converted instance is added to
0965: * the end of the output queue.
0966: *
0967: * @param instance the instance to convert
0968: */
0969: protected void convertInstance(Instance instance) {
0970:
0971: int index = 0;
0972: double[] vals = new double[outputFormatPeek().numAttributes()];
0973: // Copy and convert the values
0974: for (int i = 0; i < getInputFormat().numAttributes(); i++) {
0975: if (m_DiscretizeCols.isInRange(i)
0976: && getInputFormat().attribute(i).isNumeric()) {
0977: int j;
0978: double currentVal = instance.value(i);
0979: if (m_CutPoints[i] == null) {
0980: if (instance.isMissing(i)) {
0981: vals[index] = Instance.missingValue();
0982: } else {
0983: vals[index] = 0;
0984: }
0985: index++;
0986: } else {
0987: if (!m_MakeBinary) {
0988: if (instance.isMissing(i)) {
0989: vals[index] = Instance.missingValue();
0990: } else {
0991: for (j = 0; j < m_CutPoints[i].length; j++) {
0992: if (currentVal <= m_CutPoints[i][j]) {
0993: break;
0994: }
0995: }
0996: vals[index] = j;
0997: }
0998: index++;
0999: } else {
1000: for (j = 0; j < m_CutPoints[i].length; j++) {
1001: if (instance.isMissing(i)) {
1002: vals[index] = Instance.missingValue();
1003: } else if (currentVal <= m_CutPoints[i][j]) {
1004: vals[index] = 0;
1005: } else {
1006: vals[index] = 1;
1007: }
1008: index++;
1009: }
1010: }
1011: }
1012: } else {
1013: vals[index] = instance.value(i);
1014: index++;
1015: }
1016: }
1017:
1018: Instance inst = null;
1019: if (instance instanceof SparseInstance) {
1020: inst = new SparseInstance(instance.weight(), vals);
1021: } else {
1022: inst = new Instance(instance.weight(), vals);
1023: }
1024: inst.setDataset(getOutputFormat());
1025: copyValues(inst, false, instance.dataset(), getOutputFormat());
1026: inst.setDataset(getOutputFormat());
1027: push(inst);
1028: }
1029:
1030: /**
1031: * Main method for testing this class.
1032: *
1033: * @param argv should contain arguments to the filter: use -h for help
1034: */
1035: public static void main(String[] argv) {
1036: runFilter(new Discretize(), argv);
1037: }
1038: }
|