001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * NominalToBinary.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.filters.supervised.attribute;
024:
025: import weka.core.Attribute;
026: import weka.core.Capabilities;
027: import weka.core.FastVector;
028: import weka.core.Instance;
029: import weka.core.Instances;
030: import weka.core.Option;
031: import weka.core.OptionHandler;
032: import weka.core.SparseInstance;
033: import weka.core.TechnicalInformation;
034: import weka.core.TechnicalInformationHandler;
035: import weka.core.UnassignedClassException;
036: import weka.core.Utils;
037: import weka.core.Capabilities.Capability;
038: import weka.core.TechnicalInformation.Field;
039: import weka.core.TechnicalInformation.Type;
040: import weka.filters.Filter;
041: import weka.filters.SupervisedFilter;
042:
043: import java.util.Enumeration;
044: import java.util.Vector;
045:
046: /**
047: <!-- globalinfo-start -->
048: * Converts all nominal attributes into binary numeric attributes. An attribute with k values is transformed into k binary attributes if the class is nominal (using the one-attribute-per-value approach). Binary attributes are left binary, if option '-A' is not given.If the class is numeric, k - 1 new binary attributes are generated in the manner described in "Classification and Regression Trees" by Breiman et al. (i.e. taking the average class value associated with each attribute value into account)<br/>
049: * <br/>
050: * For more information, see:<br/>
051: * <br/>
052: * L. Breiman, J.H. Friedman, R.A. Olshen, C.J. Stone (1984). Classification and Regression Trees. Wadsworth Inc.
053: * <p/>
054: <!-- globalinfo-end -->
055: *
056: <!-- technical-bibtex-start -->
057: * BibTeX:
058: * <pre>
059: * @book{Breiman1984,
060: * author = {L. Breiman and J.H. Friedman and R.A. Olshen and C.J. Stone},
061: * publisher = {Wadsworth Inc},
062: * title = {Classification and Regression Trees},
063: * year = {1984},
064: * ISBN = {0412048418}
065: * }
066: * </pre>
067: * <p/>
068: <!-- technical-bibtex-end -->
069: *
070: <!-- options-start -->
071: * Valid options are: <p/>
072: *
073: * <pre> -N
074: * Sets if binary attributes are to be coded as nominal ones.</pre>
075: *
076: * <pre> -A
077: * For each nominal value a new attribute is created,
078: * not only if there are more than 2 values.</pre>
079: *
080: <!-- options-end -->
081: *
082: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
083: * @version $Revision: 1.7 $
084: */
085: public class NominalToBinary extends Filter implements
086: SupervisedFilter, OptionHandler, TechnicalInformationHandler {
087:
088: /** for serialization */
089: static final long serialVersionUID = -5004607029857673950L;
090:
091: /** The sorted indices of the attribute values. */
092: private int[][] m_Indices = null;
093:
094: /** Are the new attributes going to be nominal or numeric ones? */
095: private boolean m_Numeric = true;
096:
097: /** Are all values transformed into new attributes? */
098: private boolean m_TransformAll = false;
099:
100: /**
101: * Returns a string describing this filter
102: *
103: * @return a description of the filter suitable for
104: * displaying in the explorer/experimenter gui
105: */
106: public String globalInfo() {
107:
108: return "Converts all nominal attributes into binary numeric attributes. An "
109: + "attribute with k values is transformed into k binary attributes if "
110: + "the class is nominal (using the one-attribute-per-value approach). "
111: + "Binary attributes are left binary, if option '-A' is not given."
112: + "If the class is numeric, k - 1 new binary attributes are generated "
113: + "in the manner described in \"Classification and Regression "
114: + "Trees\" by Breiman et al. (i.e. taking the average class value associated "
115: + "with each attribute value into account)\n\n"
116: + "For more information, see:\n\n"
117: + getTechnicalInformation().toString();
118: }
119:
120: /**
121: * Returns an instance of a TechnicalInformation object, containing
122: * detailed information about the technical background of this class,
123: * e.g., paper reference or book this class is based on.
124: *
125: * @return the technical information about this class
126: */
127: public TechnicalInformation getTechnicalInformation() {
128: TechnicalInformation result;
129:
130: result = new TechnicalInformation(Type.BOOK);
131: result
132: .setValue(Field.AUTHOR,
133: "L. Breiman and J.H. Friedman and R.A. Olshen and C.J. Stone");
134: result.setValue(Field.TITLE,
135: "Classification and Regression Trees");
136: result.setValue(Field.YEAR, "1984");
137: result.setValue(Field.PUBLISHER, "Wadsworth Inc");
138: result.setValue(Field.ISBN, "0412048418");
139:
140: return result;
141: }
142:
143: /**
144: * Returns the Capabilities of this filter.
145: *
146: * @return the capabilities of this object
147: * @see Capabilities
148: */
149: public Capabilities getCapabilities() {
150: Capabilities result = super .getCapabilities();
151:
152: // attributes
153: result.enableAllAttributes();
154: result.enable(Capability.MISSING_VALUES);
155:
156: // class
157: result.enable(Capability.NUMERIC_CLASS);
158: result.enable(Capability.DATE_CLASS);
159: result.enable(Capability.NOMINAL_CLASS);
160:
161: return result;
162: }
163:
164: /**
165: * Sets the format of the input instances.
166: *
167: * @param instanceInfo an Instances object containing the input
168: * instance structure (any instances contained in the object are
169: * ignored - only the structure is required).
170: * @return true if the outputFormat may be collected immediately
171: * @throws Exception if the input format can't be set
172: * successfully
173: */
174: public boolean setInputFormat(Instances instanceInfo)
175: throws Exception {
176:
177: super .setInputFormat(instanceInfo);
178: if (instanceInfo.classIndex() < 0) {
179: throw new UnassignedClassException(
180: "No class has been assigned to the instances");
181: }
182: setOutputFormat();
183: m_Indices = null;
184: if (instanceInfo.classAttribute().isNominal()) {
185: return true;
186: } else {
187: return false;
188: }
189: }
190:
191: /**
192: * Input an instance for filtering. Filter requires all
193: * training instances be read before producing output.
194: *
195: * @param instance the input instance
196: * @return true if the filtered instance may now be
197: * collected with output().
198: * @throws IllegalStateException if no input format has been set
199: */
200: public boolean input(Instance instance) {
201:
202: if (getInputFormat() == null) {
203: throw new IllegalStateException(
204: "No input instance format defined");
205: }
206: if (m_NewBatch) {
207: resetQueue();
208: m_NewBatch = false;
209: }
210: if ((m_Indices != null)
211: || (getInputFormat().classAttribute().isNominal())) {
212: convertInstance(instance);
213: return true;
214: }
215: bufferInput(instance);
216: return false;
217: }
218:
219: /**
220: * Signify that this batch of input to the filter is finished.
221: * If the filter requires all instances prior to filtering,
222: * output() may now be called to retrieve the filtered instances.
223: *
224: * @return true if there are instances pending output
225: * @throws IllegalStateException if no input structure has been defined
226: */
227: public boolean batchFinished() {
228:
229: if (getInputFormat() == null) {
230: throw new IllegalStateException(
231: "No input instance format defined");
232: }
233: if ((m_Indices == null)
234: && (getInputFormat().classAttribute().isNumeric())) {
235: computeAverageClassValues();
236: setOutputFormat();
237:
238: // Convert pending input instances
239:
240: for (int i = 0; i < getInputFormat().numInstances(); i++) {
241: convertInstance(getInputFormat().instance(i));
242: }
243: }
244: flushInput();
245:
246: m_NewBatch = true;
247: return (numPendingOutput() != 0);
248: }
249:
250: /**
251: * Returns an enumeration describing the available options.
252: *
253: * @return an enumeration of all the available options.
254: */
255: public Enumeration listOptions() {
256:
257: Vector newVector = new Vector(1);
258:
259: newVector
260: .addElement(new Option(
261: "\tSets if binary attributes are to be coded as nominal ones.",
262: "N", 0, "-N"));
263:
264: newVector
265: .addElement(new Option(
266: "\tFor each nominal value a new attribute is created, \n"
267: + "\tnot only if there are more than 2 values.",
268: "A", 0, "-A"));
269:
270: return newVector.elements();
271: }
272:
273: /**
274: * Parses a given list of options. <p/>
275: *
276: <!-- options-start -->
277: * Valid options are: <p/>
278: *
279: * <pre> -N
280: * Sets if binary attributes are to be coded as nominal ones.</pre>
281: *
282: * <pre> -A
283: * For each nominal value a new attribute is created,
284: * not only if there are more than 2 values.</pre>
285: *
286: <!-- options-end -->
287: *
288: * @param options the list of options as an array of strings
289: * @throws Exception if an option is not supported
290: */
291: public void setOptions(String[] options) throws Exception {
292:
293: setBinaryAttributesNominal(Utils.getFlag('N', options));
294:
295: setTransformAllValues(Utils.getFlag('A', options));
296:
297: if (getInputFormat() != null)
298: setInputFormat(getInputFormat());
299: }
300:
301: /**
302: * Gets the current settings of the filter.
303: *
304: * @return an array of strings suitable for passing to setOptions
305: */
306: public String[] getOptions() {
307:
308: String[] options = new String[1];
309: int current = 0;
310:
311: if (getBinaryAttributesNominal()) {
312: options[current++] = "-N";
313: }
314:
315: if (getTransformAllValues()) {
316: options[current++] = "-A";
317: }
318:
319: while (current < options.length) {
320: options[current++] = "";
321: }
322: return options;
323: }
324:
325: /**
326: * Returns the tip text for this property
327: *
328: * @return tip text for this property suitable for
329: * displaying in the explorer/experimenter gui
330: */
331: public String binaryAttributesNominalTipText() {
332: return "Whether resulting binary attributes will be nominal.";
333: }
334:
335: /**
336: * Gets if binary attributes are to be treated as nominal ones.
337: *
338: * @return true if binary attributes are to be treated as nominal ones
339: */
340: public boolean getBinaryAttributesNominal() {
341:
342: return !m_Numeric;
343: }
344:
345: /**
346: * Sets if binary attributes are to be treates as nominal ones.
347: *
348: * @param bool true if binary attributes are to be treated as nominal ones
349: */
350: public void setBinaryAttributesNominal(boolean bool) {
351:
352: m_Numeric = !bool;
353: }
354:
355: /**
356: * Returns the tip text for this property
357: *
358: * @return tip text for this property suitable for
359: * displaying in the explorer/experimenter gui
360: */
361: public String transformAllValuesTipText() {
362: return "Whether all nominal values are turned into new attributes, not only if there are more than 2.";
363: }
364:
365: /**
366: * Gets if all nominal values are turned into new attributes, not only if
367: * there are more than 2.
368: *
369: * @return true all nominal values are transformed into new attributes
370: */
371: public boolean getTransformAllValues() {
372:
373: return m_TransformAll;
374: }
375:
376: /**
377: * Sets whether all nominal values are transformed into new attributes, not
378: * just if there are more than 2.
379: *
380: * @param bool true if all nominal value are transformed into new attributes
381: */
382: public void setTransformAllValues(boolean bool) {
383:
384: m_TransformAll = bool;
385: }
386:
387: /** Computes average class values for each attribute and value */
388: private void computeAverageClassValues() {
389:
390: double totalCounts, sum;
391: Instance instance;
392: double[] counts;
393:
394: double[][] avgClassValues = new double[getInputFormat()
395: .numAttributes()][0];
396: m_Indices = new int[getInputFormat().numAttributes()][0];
397: for (int j = 0; j < getInputFormat().numAttributes(); j++) {
398: Attribute att = getInputFormat().attribute(j);
399: if (att.isNominal()) {
400: avgClassValues[j] = new double[att.numValues()];
401: counts = new double[att.numValues()];
402: for (int i = 0; i < getInputFormat().numInstances(); i++) {
403: instance = getInputFormat().instance(i);
404: if (!instance.classIsMissing()
405: && (!instance.isMissing(j))) {
406: counts[(int) instance.value(j)] += instance
407: .weight();
408: avgClassValues[j][(int) instance.value(j)] += instance
409: .weight()
410: * instance.classValue();
411: }
412: }
413: sum = Utils.sum(avgClassValues[j]);
414: totalCounts = Utils.sum(counts);
415: if (Utils.gr(totalCounts, 0)) {
416: for (int k = 0; k < att.numValues(); k++) {
417: if (Utils.gr(counts[k], 0)) {
418: avgClassValues[j][k] /= (double) counts[k];
419: } else {
420: avgClassValues[j][k] = sum
421: / (double) totalCounts;
422: }
423: }
424: }
425: m_Indices[j] = Utils.sort(avgClassValues[j]);
426: }
427: }
428: }
429:
430: /** Set the output format. */
431: private void setOutputFormat() {
432:
433: if (getInputFormat().classAttribute().isNominal()) {
434: setOutputFormatNominal();
435: } else {
436: setOutputFormatNumeric();
437: }
438: }
439:
440: /**
441: * Convert a single instance over. The converted instance is
442: * added to the end of the output queue.
443: *
444: * @param instance the instance to convert
445: */
446: private void convertInstance(Instance inst) {
447:
448: if (getInputFormat().classAttribute().isNominal()) {
449: convertInstanceNominal(inst);
450: } else {
451: convertInstanceNumeric(inst);
452: }
453: }
454:
455: /**
456: * Set the output format if the class is nominal.
457: */
458: private void setOutputFormatNominal() {
459:
460: FastVector newAtts;
461: int newClassIndex;
462: StringBuffer attributeName;
463: Instances outputFormat;
464: FastVector vals;
465:
466: // Compute new attributes
467:
468: newClassIndex = getInputFormat().classIndex();
469: newAtts = new FastVector();
470: for (int j = 0; j < getInputFormat().numAttributes(); j++) {
471: Attribute att = getInputFormat().attribute(j);
472: if ((!att.isNominal())
473: || (j == getInputFormat().classIndex())) {
474: newAtts.addElement(att.copy());
475: } else {
476: if ((att.numValues() <= 2) && (!m_TransformAll)) {
477: if (m_Numeric) {
478: newAtts.addElement(new Attribute(att.name()));
479: } else {
480: newAtts.addElement(att.copy());
481: }
482: } else {
483:
484: if (j < getInputFormat().classIndex()) {
485: newClassIndex += att.numValues() - 1;
486: }
487:
488: // Compute values for new attributes
489: for (int k = 0; k < att.numValues(); k++) {
490: attributeName = new StringBuffer(att.name()
491: + "=");
492: attributeName.append(att.value(k));
493: if (m_Numeric) {
494: newAtts.addElement(new Attribute(
495: attributeName.toString()));
496: } else {
497: vals = new FastVector(2);
498: vals.addElement("f");
499: vals.addElement("t");
500: newAtts.addElement(new Attribute(
501: attributeName.toString(), vals));
502: }
503: }
504: }
505: }
506: }
507: outputFormat = new Instances(getInputFormat().relationName(),
508: newAtts, 0);
509: outputFormat.setClassIndex(newClassIndex);
510: setOutputFormat(outputFormat);
511: }
512:
513: /**
514: * Set the output format if the class is numeric.
515: */
516: private void setOutputFormatNumeric() {
517:
518: if (m_Indices == null) {
519: setOutputFormat(null);
520: return;
521: }
522: FastVector newAtts;
523: int newClassIndex;
524: StringBuffer attributeName;
525: Instances outputFormat;
526: FastVector vals;
527:
528: // Compute new attributes
529:
530: newClassIndex = getInputFormat().classIndex();
531: newAtts = new FastVector();
532: for (int j = 0; j < getInputFormat().numAttributes(); j++) {
533: Attribute att = getInputFormat().attribute(j);
534: if ((!att.isNominal())
535: || (j == getInputFormat().classIndex())) {
536: newAtts.addElement(att.copy());
537: } else {
538: if (j < getInputFormat().classIndex())
539: newClassIndex += att.numValues() - 2;
540:
541: // Compute values for new attributes
542:
543: for (int k = 1; k < att.numValues(); k++) {
544: attributeName = new StringBuffer(att.name() + "=");
545: for (int l = k; l < att.numValues(); l++) {
546: if (l > k) {
547: attributeName.append(',');
548: }
549: attributeName
550: .append(att.value(m_Indices[j][l]));
551: }
552: if (m_Numeric) {
553: newAtts.addElement(new Attribute(attributeName
554: .toString()));
555: } else {
556: vals = new FastVector(2);
557: vals.addElement("f");
558: vals.addElement("t");
559: newAtts.addElement(new Attribute(attributeName
560: .toString(), vals));
561: }
562: }
563: }
564: }
565: outputFormat = new Instances(getInputFormat().relationName(),
566: newAtts, 0);
567: outputFormat.setClassIndex(newClassIndex);
568: setOutputFormat(outputFormat);
569: }
570:
571: /**
572: * Convert a single instance over if the class is nominal. The converted
573: * instance is added to the end of the output queue.
574: *
575: * @param instance the instance to convert
576: */
577: private void convertInstanceNominal(Instance instance) {
578:
579: double[] vals = new double[outputFormatPeek().numAttributes()];
580: int attSoFar = 0;
581:
582: for (int j = 0; j < getInputFormat().numAttributes(); j++) {
583: Attribute att = getInputFormat().attribute(j);
584: if ((!att.isNominal())
585: || (j == getInputFormat().classIndex())) {
586: vals[attSoFar] = instance.value(j);
587: attSoFar++;
588: } else {
589: if ((att.numValues() <= 2) && (!m_TransformAll)) {
590: vals[attSoFar] = instance.value(j);
591: attSoFar++;
592: } else {
593: if (instance.isMissing(j)) {
594: for (int k = 0; k < att.numValues(); k++) {
595: vals[attSoFar + k] = instance.value(j);
596: }
597: } else {
598: for (int k = 0; k < att.numValues(); k++) {
599: if (k == (int) instance.value(j)) {
600: vals[attSoFar + k] = 1;
601: } else {
602: vals[attSoFar + k] = 0;
603: }
604: }
605: }
606: attSoFar += att.numValues();
607: }
608: }
609: }
610: Instance inst = null;
611: if (instance instanceof SparseInstance) {
612: inst = new SparseInstance(instance.weight(), vals);
613: } else {
614: inst = new Instance(instance.weight(), vals);
615: }
616: inst.setDataset(getOutputFormat());
617: copyValues(inst, false, instance.dataset(), getOutputFormat());
618: inst.setDataset(getOutputFormat());
619: push(inst);
620: }
621:
622: /**
623: * Convert a single instance over if the class is numeric. The converted
624: * instance is added to the end of the output queue.
625: *
626: * @param instance the instance to convert
627: */
628: private void convertInstanceNumeric(Instance instance) {
629:
630: double[] vals = new double[outputFormatPeek().numAttributes()];
631: int attSoFar = 0;
632:
633: for (int j = 0; j < getInputFormat().numAttributes(); j++) {
634: Attribute att = getInputFormat().attribute(j);
635: if ((!att.isNominal())
636: || (j == getInputFormat().classIndex())) {
637: vals[attSoFar] = instance.value(j);
638: attSoFar++;
639: } else {
640: if (instance.isMissing(j)) {
641: for (int k = 0; k < att.numValues() - 1; k++) {
642: vals[attSoFar + k] = instance.value(j);
643: }
644: } else {
645: int k = 0;
646: while ((int) instance.value(j) != m_Indices[j][k]) {
647: vals[attSoFar + k] = 1;
648: k++;
649: }
650: while (k < att.numValues() - 1) {
651: vals[attSoFar + k] = 0;
652: k++;
653: }
654: }
655: attSoFar += att.numValues() - 1;
656: }
657: }
658: Instance inst = null;
659: if (instance instanceof SparseInstance) {
660: inst = new SparseInstance(instance.weight(), vals);
661: } else {
662: inst = new Instance(instance.weight(), vals);
663: }
664: inst.setDataset(getOutputFormat());
665: copyValues(inst, false, instance.dataset(), getOutputFormat());
666: inst.setDataset(getOutputFormat());
667: push(inst);
668: }
669:
670: /**
671: * Main method for testing this class.
672: *
673: * @param argv should contain arguments to the filter:
674: * use -h for help
675: */
676: public static void main(String[] argv) {
677: runFilter(new NominalToBinary(), argv);
678: }
679: }
|