001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * ChiSquaredAttributeEval.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.attributeSelection;
024:
025: import weka.core.Capabilities;
026: import weka.core.ContingencyTables;
027: import weka.core.Instance;
028: import weka.core.Instances;
029: import weka.core.Option;
030: import weka.core.OptionHandler;
031: import weka.core.Utils;
032: import weka.core.Capabilities.Capability;
033: import weka.filters.Filter;
034: import weka.filters.supervised.attribute.Discretize;
035: import weka.filters.unsupervised.attribute.NumericToBinary;
036:
037: import java.util.Enumeration;
038: import java.util.Vector;
039:
040: /**
041: <!-- globalinfo-start -->
042: * ChiSquaredAttributeEval :<br/>
043: * <br/>
044: * Evaluates the worth of an attribute by computing the value of the chi-squared statistic with respect to the class.<br/>
045: * <p/>
046: <!-- globalinfo-end -->
047: *
048: <!-- options-start -->
049: * Valid options are: <p/>
050: *
051: * <pre> -M
052: * treat missing values as a seperate value.</pre>
053: *
054: * <pre> -B
055: * just binarize numeric attributes instead
056: * of properly discretizing them.</pre>
057: *
058: <!-- options-end -->
059: *
060: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
061: * @version $Revision: 1.14 $
062: * @see Discretize
063: * @see NumericToBinary
064: */
065: public class ChiSquaredAttributeEval extends AttributeEvaluator
066: implements OptionHandler {
067:
068: /** for serialization */
069: static final long serialVersionUID = -8316857822521717692L;
070:
071: /** Treat missing values as a seperate value */
072: private boolean m_missing_merge;
073:
074: /** Just binarize numeric attributes */
075: private boolean m_Binarize;
076:
077: /** The chi-squared value for each attribute */
078: private double[] m_ChiSquareds;
079:
080: /**
081: * Returns a string describing this attribute evaluator
082: * @return a description of the evaluator suitable for
083: * displaying in the explorer/experimenter gui
084: */
085: public String globalInfo() {
086: return "ChiSquaredAttributeEval :\n\nEvaluates the worth of an attribute "
087: + "by computing the value of the chi-squared statistic with respect to the class.\n";
088: }
089:
090: /**
091: * Constructor
092: */
093: public ChiSquaredAttributeEval() {
094: resetOptions();
095: }
096:
097: /**
098: * Returns an enumeration describing the available options
099: * @return an enumeration of all the available options
100: **/
101: public Enumeration listOptions() {
102: Vector newVector = new Vector(2);
103: newVector.addElement(new Option(
104: "\ttreat missing values as a seperate " + "value.",
105: "M", 0, "-M"));
106: newVector.addElement(new Option(
107: "\tjust binarize numeric attributes instead \n"
108: + "\tof properly discretizing them.", "B", 0,
109: "-B"));
110: return newVector.elements();
111: }
112:
113: /**
114: * Parses a given list of options. <p/>
115: *
116: <!-- options-start -->
117: * Valid options are: <p/>
118: *
119: * <pre> -M
120: * treat missing values as a seperate value.</pre>
121: *
122: * <pre> -B
123: * just binarize numeric attributes instead
124: * of properly discretizing them.</pre>
125: *
126: <!-- options-end -->
127: *
128: * @param options the list of options as an array of strings
129: * @throws Exception if an option is not supported
130: */
131: public void setOptions(String[] options) throws Exception {
132:
133: resetOptions();
134: setMissingMerge(!(Utils.getFlag('M', options)));
135: setBinarizeNumericAttributes(Utils.getFlag('B', options));
136: }
137:
138: /**
139: * Gets the current settings.
140: *
141: * @return an array of strings suitable for passing to setOptions()
142: */
143: public String[] getOptions() {
144: String[] options = new String[2];
145: int current = 0;
146:
147: if (!getMissingMerge()) {
148: options[current++] = "-M";
149: }
150: if (getBinarizeNumericAttributes()) {
151: options[current++] = "-B";
152: }
153:
154: while (current < options.length) {
155: options[current++] = "";
156: }
157:
158: return options;
159: }
160:
161: /**
162: * Returns the tip text for this property
163: * @return tip text for this property suitable for
164: * displaying in the explorer/experimenter gui
165: */
166: public String binarizeNumericAttributesTipText() {
167: return "Just binarize numeric attributes instead of properly discretizing them.";
168: }
169:
170: /**
171: * Binarize numeric attributes.
172: *
173: * @param b true=binarize numeric attributes
174: */
175: public void setBinarizeNumericAttributes(boolean b) {
176: m_Binarize = b;
177: }
178:
179: /**
180: * get whether numeric attributes are just being binarized.
181: *
182: * @return true if missing values are being distributed.
183: */
184: public boolean getBinarizeNumericAttributes() {
185: return m_Binarize;
186: }
187:
188: /**
189: * Returns the tip text for this property
190: * @return tip text for this property suitable for
191: * displaying in the explorer/experimenter gui
192: */
193: public String missingMergeTipText() {
194: return "Distribute counts for missing values. Counts are distributed "
195: + "across other values in proportion to their frequency. Otherwise, "
196: + "missing is treated as a separate value.";
197: }
198:
199: /**
200: * distribute the counts for missing values across observed values
201: *
202: * @param b true=distribute missing values.
203: */
204: public void setMissingMerge(boolean b) {
205: m_missing_merge = b;
206: }
207:
208: /**
209: * get whether missing values are being distributed or not
210: *
211: * @return true if missing values are being distributed.
212: */
213: public boolean getMissingMerge() {
214: return m_missing_merge;
215: }
216:
217: /**
218: * Returns the capabilities of this evaluator.
219: *
220: * @return the capabilities of this evaluator
221: * @see Capabilities
222: */
223: public Capabilities getCapabilities() {
224: Capabilities result = super .getCapabilities();
225:
226: // attributes
227: result.enable(Capability.NOMINAL_ATTRIBUTES);
228: result.enable(Capability.NUMERIC_ATTRIBUTES);
229: result.enable(Capability.DATE_ATTRIBUTES);
230: result.enable(Capability.MISSING_VALUES);
231:
232: // class
233: result.enable(Capability.NOMINAL_CLASS);
234: result.enable(Capability.MISSING_CLASS_VALUES);
235:
236: return result;
237: }
238:
239: /**
240: * Initializes a chi-squared attribute evaluator.
241: * Discretizes all attributes that are numeric.
242: *
243: * @param data set of instances serving as training data
244: * @throws Exception if the evaluator has not been
245: * generated successfully
246: */
247: public void buildEvaluator(Instances data) throws Exception {
248:
249: // can evaluator handle data?
250: getCapabilities().testWithFail(data);
251:
252: int classIndex = data.classIndex();
253: int numInstances = data.numInstances();
254:
255: if (!m_Binarize) {
256: Discretize disTransform = new Discretize();
257: disTransform.setUseBetterEncoding(true);
258: disTransform.setInputFormat(data);
259: data = Filter.useFilter(data, disTransform);
260: } else {
261: NumericToBinary binTransform = new NumericToBinary();
262: binTransform.setInputFormat(data);
263: data = Filter.useFilter(data, binTransform);
264: }
265: int numClasses = data.attribute(classIndex).numValues();
266:
267: // Reserve space and initialize counters
268: double[][][] counts = new double[data.numAttributes()][][];
269: for (int k = 0; k < data.numAttributes(); k++) {
270: if (k != classIndex) {
271: int numValues = data.attribute(k).numValues();
272: counts[k] = new double[numValues + 1][numClasses + 1];
273: }
274: }
275:
276: // Initialize counters
277: double[] temp = new double[numClasses + 1];
278: for (int k = 0; k < numInstances; k++) {
279: Instance inst = data.instance(k);
280: if (inst.classIsMissing()) {
281: temp[numClasses] += inst.weight();
282: } else {
283: temp[(int) inst.classValue()] += inst.weight();
284: }
285: }
286: for (int k = 0; k < counts.length; k++) {
287: if (k != classIndex) {
288: for (int i = 0; i < temp.length; i++) {
289: counts[k][0][i] = temp[i];
290: }
291: }
292: }
293:
294: // Get counts
295: for (int k = 0; k < numInstances; k++) {
296: Instance inst = data.instance(k);
297: for (int i = 0; i < inst.numValues(); i++) {
298: if (inst.index(i) != classIndex) {
299: if (inst.isMissingSparse(i)
300: || inst.classIsMissing()) {
301: if (!inst.isMissingSparse(i)) {
302: counts[inst.index(i)][(int) inst
303: .valueSparse(i)][numClasses] += inst
304: .weight();
305: counts[inst.index(i)][0][numClasses] -= inst
306: .weight();
307: } else if (!inst.classIsMissing()) {
308: counts[inst.index(i)][data.attribute(
309: inst.index(i)).numValues()][(int) inst
310: .classValue()] += inst.weight();
311: counts[inst.index(i)][0][(int) inst
312: .classValue()] -= inst.weight();
313: } else {
314: counts[inst.index(i)][data.attribute(
315: inst.index(i)).numValues()][numClasses] += inst
316: .weight();
317: counts[inst.index(i)][0][numClasses] -= inst
318: .weight();
319: }
320: } else {
321: counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst
322: .classValue()] += inst.weight();
323: counts[inst.index(i)][0][(int) inst
324: .classValue()] -= inst.weight();
325: }
326: }
327: }
328: }
329:
330: // distribute missing counts if required
331: if (m_missing_merge) {
332:
333: for (int k = 0; k < data.numAttributes(); k++) {
334: if (k != classIndex) {
335: int numValues = data.attribute(k).numValues();
336:
337: // Compute marginals
338: double[] rowSums = new double[numValues];
339: double[] columnSums = new double[numClasses];
340: double sum = 0;
341: for (int i = 0; i < numValues; i++) {
342: for (int j = 0; j < numClasses; j++) {
343: rowSums[i] += counts[k][i][j];
344: columnSums[j] += counts[k][i][j];
345: }
346: sum += rowSums[i];
347: }
348:
349: if (Utils.gr(sum, 0)) {
350: double[][] additions = new double[numValues][numClasses];
351:
352: // Compute what needs to be added to each row
353: for (int i = 0; i < numValues; i++) {
354: for (int j = 0; j < numClasses; j++) {
355: additions[i][j] = (rowSums[i] / sum)
356: * counts[k][numValues][j];
357: }
358: }
359:
360: // Compute what needs to be added to each column
361: for (int i = 0; i < numClasses; i++) {
362: for (int j = 0; j < numValues; j++) {
363: additions[j][i] += (columnSums[i] / sum)
364: * counts[k][j][numClasses];
365: }
366: }
367:
368: // Compute what needs to be added to each cell
369: for (int i = 0; i < numClasses; i++) {
370: for (int j = 0; j < numValues; j++) {
371: additions[j][i] += (counts[k][j][i] / sum)
372: * counts[k][numValues][numClasses];
373: }
374: }
375:
376: // Make new contingency table
377: double[][] newTable = new double[numValues][numClasses];
378: for (int i = 0; i < numValues; i++) {
379: for (int j = 0; j < numClasses; j++) {
380: newTable[i][j] = counts[k][i][j]
381: + additions[i][j];
382: }
383: }
384: counts[k] = newTable;
385: }
386: }
387: }
388: }
389:
390: // Compute chi-squared values
391: m_ChiSquareds = new double[data.numAttributes()];
392: for (int i = 0; i < data.numAttributes(); i++) {
393: if (i != classIndex) {
394: m_ChiSquareds[i] = ContingencyTables.chiVal(
395: ContingencyTables.reduceMatrix(counts[i]),
396: false);
397: }
398: }
399: }
400:
401: /**
402: * Reset options to their default values
403: */
404: protected void resetOptions() {
405: m_ChiSquareds = null;
406: m_missing_merge = true;
407: m_Binarize = false;
408: }
409:
410: /**
411: * evaluates an individual attribute by measuring its
412: * chi-squared value.
413: *
414: * @param attribute the index of the attribute to be evaluated
415: * @return the chi-squared value
416: * @throws Exception if the attribute could not be evaluated
417: */
418: public double evaluateAttribute(int attribute) throws Exception {
419:
420: return m_ChiSquareds[attribute];
421: }
422:
423: /**
424: * Describe the attribute evaluator
425: * @return a description of the attribute evaluator as a string
426: */
427: public String toString() {
428: StringBuffer text = new StringBuffer();
429:
430: if (m_ChiSquareds == null) {
431: text
432: .append("Chi-squared attribute evaluator has not been built");
433: } else {
434: text.append("\tChi-squared Ranking Filter");
435: if (!m_missing_merge) {
436: text.append("\n\tMissing values treated as seperate");
437: }
438: if (m_Binarize) {
439: text
440: .append("\n\tNumeric attributes are just binarized");
441: }
442: }
443:
444: text.append("\n");
445: return text.toString();
446: }
447:
448: /**
449: * Main method.
450: *
451: * @param args the options
452: */
453: public static void main(String[] args) {
454: runEvaluator(new ChiSquaredAttributeEval(), args);
455: }
456: }
|