001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * ReplaceMissingValues.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.filters.unsupervised.attribute;
024:
025: import weka.core.Capabilities;
026: import weka.core.Instance;
027: import weka.core.Instances;
028: import weka.core.SparseInstance;
029: import weka.core.Utils;
030: import weka.core.Capabilities.Capability;
031: import weka.filters.UnsupervisedFilter;
032:
033: /**
034: <!-- globalinfo-start -->
035: * Replaces all missing values for nominal and numeric attributes in a dataset with the modes and means from the training data.
036: * <p/>
037: <!-- globalinfo-end -->
038: *
039: <!-- options-start -->
040: * Valid options are: <p/>
041: *
042: * <pre> -unset-class-temporarily
043: * Unsets the class index temporarily before the filter is
044: * applied to the data.
045: * (default: no)</pre>
046: *
047: <!-- options-end -->
048: *
049: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
050: * @version $Revision: 1.8 $
051: */
052: public class ReplaceMissingValues extends PotentialClassIgnorer
053: implements UnsupervisedFilter {
054:
055: /** for serialization */
056: static final long serialVersionUID = 8349568310991609867L;
057:
058: /** The modes and means */
059: private double[] m_ModesAndMeans = null;
060:
061: /**
062: * Returns a string describing this filter
063: *
064: * @return a description of the filter suitable for
065: * displaying in the explorer/experimenter gui
066: */
067: public String globalInfo() {
068:
069: return "Replaces all missing values for nominal and numeric attributes in a "
070: + "dataset with the modes and means from the training data.";
071: }
072:
073: /**
074: * Returns the Capabilities of this filter.
075: *
076: * @return the capabilities of this object
077: * @see Capabilities
078: */
079: public Capabilities getCapabilities() {
080: Capabilities result = super .getCapabilities();
081:
082: // attributes
083: result.enableAllAttributes();
084: result.enable(Capability.MISSING_VALUES);
085:
086: // class
087: result.enableAllClasses();
088: result.enable(Capability.MISSING_CLASS_VALUES);
089: result.enable(Capability.NO_CLASS);
090:
091: return result;
092: }
093:
094: /**
095: * Sets the format of the input instances.
096: *
097: * @param instanceInfo an Instances object containing the input
098: * instance structure (any instances contained in the object are
099: * ignored - only the structure is required).
100: * @return true if the outputFormat may be collected immediately
101: * @throws Exception if the input format can't be set
102: * successfully
103: */
104: public boolean setInputFormat(Instances instanceInfo)
105: throws Exception {
106:
107: super .setInputFormat(instanceInfo);
108: setOutputFormat(instanceInfo);
109: m_ModesAndMeans = null;
110: return true;
111: }
112:
113: /**
114: * Input an instance for filtering. Filter requires all
115: * training instances be read before producing output.
116: *
117: * @param instance the input instance
118: * @return true if the filtered instance may now be
119: * collected with output().
120: * @throws IllegalStateException if no input format has been set.
121: */
122: public boolean input(Instance instance) {
123:
124: if (getInputFormat() == null) {
125: throw new IllegalStateException(
126: "No input instance format defined");
127: }
128: if (m_NewBatch) {
129: resetQueue();
130: m_NewBatch = false;
131: }
132: if (m_ModesAndMeans == null) {
133: bufferInput(instance);
134: return false;
135: } else {
136: convertInstance(instance);
137: return true;
138: }
139: }
140:
141: /**
142: * Signify that this batch of input to the filter is finished.
143: * If the filter requires all instances prior to filtering,
144: * output() may now be called to retrieve the filtered instances.
145: *
146: * @return true if there are instances pending output
147: * @throws IllegalStateException if no input structure has been defined
148: */
149: public boolean batchFinished() {
150:
151: if (getInputFormat() == null) {
152: throw new IllegalStateException(
153: "No input instance format defined");
154: }
155:
156: if (m_ModesAndMeans == null) {
157: // Compute modes and means
158: double sumOfWeights = getInputFormat().sumOfWeights();
159: double[][] counts = new double[getInputFormat()
160: .numAttributes()][];
161: for (int i = 0; i < getInputFormat().numAttributes(); i++) {
162: if (getInputFormat().attribute(i).isNominal()) {
163: counts[i] = new double[getInputFormat()
164: .attribute(i).numValues()];
165: counts[i][0] = sumOfWeights;
166: }
167: }
168: double[] sums = new double[getInputFormat().numAttributes()];
169: for (int i = 0; i < sums.length; i++) {
170: sums[i] = sumOfWeights;
171: }
172: double[] results = new double[getInputFormat()
173: .numAttributes()];
174: for (int j = 0; j < getInputFormat().numInstances(); j++) {
175: Instance inst = getInputFormat().instance(j);
176: for (int i = 0; i < inst.numValues(); i++) {
177: if (!inst.isMissingSparse(i)) {
178: double value = inst.valueSparse(i);
179: if (inst.attributeSparse(i).isNominal()) {
180: counts[inst.index(i)][(int) value] += inst
181: .weight();
182: counts[inst.index(i)][0] -= inst.weight();
183: } else if (inst.attributeSparse(i).isNumeric()) {
184: results[inst.index(i)] += inst.weight()
185: * inst.valueSparse(i);
186: }
187: } else {
188: if (inst.attributeSparse(i).isNominal()) {
189: counts[inst.index(i)][0] -= inst.weight();
190: } else if (inst.attributeSparse(i).isNumeric()) {
191: sums[inst.index(i)] -= inst.weight();
192: }
193: }
194: }
195: }
196: m_ModesAndMeans = new double[getInputFormat()
197: .numAttributes()];
198: for (int i = 0; i < getInputFormat().numAttributes(); i++) {
199: if (getInputFormat().attribute(i).isNominal()) {
200: m_ModesAndMeans[i] = (double) Utils
201: .maxIndex(counts[i]);
202: } else if (getInputFormat().attribute(i).isNumeric()) {
203: if (Utils.gr(sums[i], 0)) {
204: m_ModesAndMeans[i] = results[i] / sums[i];
205: }
206: }
207: }
208:
209: // Convert pending input instances
210: for (int i = 0; i < getInputFormat().numInstances(); i++) {
211: convertInstance(getInputFormat().instance(i));
212: }
213: }
214: // Free memory
215: flushInput();
216:
217: m_NewBatch = true;
218: return (numPendingOutput() != 0);
219: }
220:
221: /**
222: * Convert a single instance over. The converted instance is
223: * added to the end of the output queue.
224: *
225: * @param instance the instance to convert
226: */
227: private void convertInstance(Instance instance) {
228:
229: Instance inst = null;
230: if (instance instanceof SparseInstance) {
231: double[] vals = new double[instance.numValues()];
232: int[] indices = new int[instance.numValues()];
233: int num = 0;
234: for (int j = 0; j < instance.numValues(); j++) {
235: if (instance.isMissingSparse(j)
236: && (getInputFormat().classIndex() != instance
237: .index(j))
238: && (instance.attributeSparse(j).isNominal() || instance
239: .attributeSparse(j).isNumeric())) {
240: if (m_ModesAndMeans[instance.index(j)] != 0.0) {
241: vals[num] = m_ModesAndMeans[instance.index(j)];
242: indices[num] = instance.index(j);
243: num++;
244: }
245: } else {
246: vals[num] = instance.valueSparse(j);
247: indices[num] = instance.index(j);
248: num++;
249: }
250: }
251: if (num == instance.numValues()) {
252: inst = new SparseInstance(instance.weight(), vals,
253: indices, instance.numAttributes());
254: } else {
255: double[] tempVals = new double[num];
256: int[] tempInd = new int[num];
257: System.arraycopy(vals, 0, tempVals, 0, num);
258: System.arraycopy(indices, 0, tempInd, 0, num);
259: inst = new SparseInstance(instance.weight(), tempVals,
260: tempInd, instance.numAttributes());
261: }
262: } else {
263: double[] vals = new double[getInputFormat().numAttributes()];
264: for (int j = 0; j < instance.numAttributes(); j++) {
265: if (instance.isMissing(j)
266: && (getInputFormat().classIndex() != j)
267: && (getInputFormat().attribute(j).isNominal() || getInputFormat()
268: .attribute(j).isNumeric())) {
269: vals[j] = m_ModesAndMeans[j];
270: } else {
271: vals[j] = instance.value(j);
272: }
273: }
274: inst = new Instance(instance.weight(), vals);
275: }
276: inst.setDataset(instance.dataset());
277: push(inst);
278: }
279:
280: /**
281: * Main method for testing this class.
282: *
283: * @param argv should contain arguments to the filter:
284: * use -h for help
285: */
286: public static void main(String[] argv) {
287: runFilter(new ReplaceMissingValues(), argv);
288: }
289: }
|