001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * AddNoise.java
019: * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
020: */
021:
022: package weka.filters.unsupervised.attribute;
023:
024: import weka.core.Capabilities;
025: import weka.core.Instance;
026: import weka.core.Instances;
027: import weka.core.Option;
028: import weka.core.OptionHandler;
029: import weka.core.SingleIndex;
030: import weka.core.Utils;
031: import weka.core.Capabilities.Capability;
032: import weka.filters.Filter;
033: import weka.filters.UnsupervisedFilter;
034:
035: import java.util.Enumeration;
036: import java.util.Random;
037: import java.util.Vector;
038:
039: /**
040: <!-- globalinfo-start -->
041: * An instance filter that changes a percentage of a given attributes values. The attribute must be nominal. Missing value can be treated as value itself.
042: * <p/>
043: <!-- globalinfo-end -->
044: *
045: <!-- options-start -->
046: * Valid options are: <p/>
047: *
048: * <pre> -C <col>
049: * Index of the attribute to be changed
050: * (default last attribute)</pre>
051: *
052: * <pre> -M
053: * Treat missing values as an extra value
054: * </pre>
055: *
056: * <pre> -P <num>
057: * Specify the percentage of noise introduced
058: * to the data (default 10)</pre>
059: *
060: * <pre> -S <num>
061: * Specify the random number seed (default 1)</pre>
062: *
063: <!-- options-end -->
064: *
065: * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
066: * @version $Revision: 1.8 $
067: */
068: public class AddNoise extends Filter implements UnsupervisedFilter,
069: OptionHandler {
070:
071: /** for serialization */
072: static final long serialVersionUID = -8499673222857299082L;
073:
074: /** The attribute's index setting. */
075: private SingleIndex m_AttIndex = new SingleIndex("last");
076:
077: /** Flag if missing values are taken as value. */
078: private boolean m_UseMissing = false;
079:
080: /** The subsample size, percent of original set, default 10% */
081: private int m_Percent = 10;
082:
083: /** The random number generator seed */
084: private int m_RandomSeed = 1;
085:
086: /**
087: * Returns a string describing this filter
088: *
089: * @return a description of the filter suitable for
090: * displaying in the explorer/experimenter gui
091: */
092: public String globalInfo() {
093:
094: return "An instance filter that changes a percentage of a given"
095: + " attributes values. The attribute must be nominal."
096: + " Missing value can be treated as value itself.";
097: }
098:
099: /**
100: * Returns an enumeration describing the available options
101: *
102: * @return an enumeration of all the available options
103: */
104: public Enumeration listOptions() {
105:
106: Vector newVector = new Vector(4);
107:
108: newVector.addElement(new Option(
109: "\tIndex of the attribute to be changed \n"
110: + "\t(default last attribute)", "C", 1,
111: "-C <col>"));
112: newVector.addElement(new Option(
113: "\tTreat missing values as an extra value \n", "M", 1,
114: "-M"));
115: newVector.addElement(new Option(
116: "\tSpecify the percentage of noise introduced \n"
117: + "\tto the data (default 10)", "P", 1,
118: "-P <num>"));
119: newVector.addElement(new Option(
120: "\tSpecify the random number seed (default 1)", "S", 1,
121: "-S <num>"));
122:
123: return newVector.elements();
124: }
125:
126: /**
127: * Parses a given list of options. <p/>
128: *
129: <!-- options-start -->
130: * Valid options are: <p/>
131: *
132: * <pre> -C <col>
133: * Index of the attribute to be changed
134: * (default last attribute)</pre>
135: *
136: * <pre> -M
137: * Treat missing values as an extra value
138: * </pre>
139: *
140: * <pre> -P <num>
141: * Specify the percentage of noise introduced
142: * to the data (default 10)</pre>
143: *
144: * <pre> -S <num>
145: * Specify the random number seed (default 1)</pre>
146: *
147: <!-- options-end -->
148: *
149: * @param options the list of options as an array of strings
150: * @throws Exception if an option is not supported
151: */
152: public void setOptions(String[] options) throws Exception {
153:
154: String indexString = Utils.getOption('C', options);
155: if (indexString.length() != 0) {
156: setAttributeIndex(indexString);
157: } else {
158: setAttributeIndex("last");
159: }
160:
161: if (Utils.getFlag('M', options)) {
162: setUseMissing(true);
163: }
164:
165: String percentString = Utils.getOption('P', options);
166: if (percentString.length() != 0) {
167: setPercent((int) Double.valueOf(percentString)
168: .doubleValue());
169: } else {
170: setPercent(10);
171: }
172:
173: String seedString = Utils.getOption('S', options);
174: if (seedString.length() != 0) {
175: setRandomSeed(Integer.parseInt(seedString));
176: } else {
177: setRandomSeed(1);
178: }
179:
180: }
181:
182: /**
183: * Gets the current settings of the filter.
184: *
185: * @return an array of strings suitable for passing to setOptions
186: */
187: public String[] getOptions() {
188:
189: String[] options = new String[7];
190: int current = 0;
191:
192: options[current++] = "-C";
193: options[current++] = "" + getAttributeIndex();
194:
195: if (getUseMissing()) {
196: options[current++] = "-M";
197: }
198:
199: options[current++] = "-P";
200: options[current++] = "" + getPercent();
201:
202: options[current++] = "-S";
203: options[current++] = "" + getRandomSeed();
204:
205: while (current < options.length) {
206: options[current++] = "";
207: }
208: return options;
209: }
210:
211: /**
212: * Returns the tip text for this property
213: *
214: * @return tip text for this property suitable for
215: * displaying in the explorer/experimenter gui
216: */
217: public String useMissingTipText() {
218:
219: return "Flag to set if missing values are used.";
220: }
221:
222: /**
223: * Gets the flag if missing values are treated as extra values.
224: *
225: * @return the flag missing values.
226: */
227: public boolean getUseMissing() {
228:
229: return m_UseMissing;
230: }
231:
232: /**
233: * Sets the flag if missing values are treated as extra values.
234: *
235: * @param newUseMissing the new flag value.
236: */
237: public void setUseMissing(boolean newUseMissing) {
238:
239: m_UseMissing = newUseMissing;
240: }
241:
242: /**
243: * Returns the tip text for this property
244: *
245: * @return tip text for this property suitable for
246: * displaying in the explorer/experimenter gui
247: */
248: public String randomSeedTipText() {
249:
250: return "Random number seed.";
251: }
252:
253: /**
254: * Gets the random number seed.
255: *
256: * @return the random number seed.
257: */
258: public int getRandomSeed() {
259:
260: return m_RandomSeed;
261: }
262:
263: /**
264: * Sets the random number seed.
265: *
266: * @param newSeed the new random number seed.
267: */
268: public void setRandomSeed(int newSeed) {
269:
270: m_RandomSeed = newSeed;
271: }
272:
273: /**
274: * Returns the tip text for this property
275: *
276: * @return tip text for this property suitable for
277: * displaying in the explorer/experimenter gui
278: */
279: public String percentTipText() {
280:
281: return "Percentage of introduced noise to data.";
282: }
283:
284: /**
285: * Gets the size of noise data as a percentage of the original set.
286: *
287: * @return the noise data size
288: */
289: public int getPercent() {
290:
291: return m_Percent;
292: }
293:
294: /**
295: * Sets the size of noise data, as a percentage of the original set.
296: *
297: * @param newPercent the subsample set size, between 0 and 100.
298: */
299: public void setPercent(int newPercent) {
300:
301: m_Percent = newPercent;
302: }
303:
304: /**
305: * Returns the tip text for this property
306: *
307: * @return tip text for this property suitable for
308: * displaying in the explorer/experimenter gui
309: */
310: public String attributeIndexTipText() {
311:
312: return "Index of the attribute that is to changed.";
313: }
314:
315: /**
316: * Get the index of the attribute used.
317: *
318: * @return the index of the attribute
319: */
320: public String getAttributeIndex() {
321:
322: return m_AttIndex.getSingleIndex();
323: }
324:
325: /**
326: * Sets index of the attribute used.
327: *
328: * @param attIndex the index of the attribute
329: */
330: public void setAttributeIndex(String attIndex) {
331:
332: m_AttIndex.setSingleIndex(attIndex);
333: }
334:
335: /**
336: * Returns the Capabilities of this filter.
337: *
338: * @return the capabilities of this object
339: * @see Capabilities
340: */
341: public Capabilities getCapabilities() {
342: Capabilities result = super .getCapabilities();
343:
344: // attributes
345: result.enableAllAttributes();
346: result.enable(Capability.MISSING_VALUES);
347:
348: // class
349: result.enableAllClasses();
350: result.enable(Capability.MISSING_CLASS_VALUES);
351: result.enable(Capability.NO_CLASS);
352:
353: return result;
354: }
355:
356: /**
357: * Sets the format of the input instances.
358: *
359: * @param instanceInfo an Instances object containing the input
360: * instance structure (any instances contained in the object are
361: * ignored - only the structure is required).
362: * @return true if the outputFormat may be collected immediately
363: * @throws Exception if the input format can't be set
364: * successfully
365: */
366: public boolean setInputFormat(Instances instanceInfo)
367: throws Exception {
368:
369: super .setInputFormat(instanceInfo);
370: // set input format
371: //m_InputFormat = new Instances(instanceInfo, 0);
372: m_AttIndex.setUpper(getInputFormat().numAttributes() - 1);
373: // set index of attribute to be changed
374:
375: // test if nominal
376: if (!getInputFormat().attribute(m_AttIndex.getIndex())
377: .isNominal()) {
378: throw new Exception("Adding noise is not possible:"
379: + "Chosen attribute is numeric.");
380: }
381:
382: // test if two values are given
383: if ((getInputFormat().attribute(m_AttIndex.getIndex())
384: .numValues() < 2)
385: && (!m_UseMissing)) {
386: throw new Exception("Adding noise is not possible:"
387: + "Chosen attribute has less than two values.");
388: }
389:
390: setOutputFormat(getInputFormat());
391: m_NewBatch = true;
392: return false;
393: }
394:
395: /**
396: * Input an instance for filtering.
397: *
398: * @param instance the input instance
399: * @return true if the filtered instance may now be
400: * collected with output().
401: * @throws Exception if the input format was not set
402: */
403: public boolean input(Instance instance) throws Exception {
404:
405: // check if input format is defined
406: if (getInputFormat() == null) {
407: throw new Exception("No input instance format defined");
408: }
409:
410: if (m_NewBatch) {
411: resetQueue();
412: m_NewBatch = false;
413: }
414:
415: if (isFirstBatchDone()) {
416: push(instance);
417: return true;
418: } else {
419: bufferInput(instance);
420: return false;
421: }
422: }
423:
424: /**
425: * Signify that this batch of input to the filter is finished.
426: * If the filter requires all instances prior to filtering,
427: * output() may now be called to retrieve the filtered instances.
428: *
429: * @return true if there are instances pending output
430: * @throws Exception if no input structure has been defined
431: */
432: public boolean batchFinished() throws Exception {
433:
434: if (getInputFormat() == null) {
435: throw new Exception("No input instance format defined");
436: }
437:
438: // Do the subsample, and clear the input instances.
439: addNoise(getInputFormat(), m_RandomSeed, m_Percent, m_AttIndex
440: .getIndex(), m_UseMissing);
441:
442: for (int i = 0; i < getInputFormat().numInstances(); i++) {
443: push((Instance) getInputFormat().instance(i).copy());
444: }
445:
446: flushInput();
447:
448: m_NewBatch = true;
449: m_FirstBatchDone = true;
450: return (numPendingOutput() != 0);
451: }
452:
453: /**
454: * add noise to the dataset
455: *
456: * a given percentage of the instances are changed in the way, that
457: * a set of instances are randomly selected using seed. The attribute
458: * given by its index is changed from its current value to one of the
459: * other possibly ones, also randomly. This is done with leaving the
460: * apportion the same.
461: * if m_UseMissing is true, missing value is used as a value of its own
462: * @param instances is the dataset
463: * @param seed used for random function
464: * @param percent percentage of instances that are changed
465: * @param attIndex index of the attribute changed
466: * @param useMissing if true missing values are treated as extra value
467: */
468: public void addNoise(Instances instances, int seed, int percent,
469: int attIndex, boolean useMissing) {
470: int indexList[];
471: int partition_count[];
472: int partition_max[];
473: double splitPercent = (double) percent; // percentage used for splits
474:
475: // fill array with the indexes
476: indexList = new int[instances.numInstances()];
477: for (int i = 0; i < instances.numInstances(); i++) {
478: indexList[i] = i;
479: }
480:
481: // randomize list of indexes
482: Random random = new Random(seed);
483: for (int i = instances.numInstances() - 1; i >= 0; i--) {
484: int hValue = indexList[i];
485: int hIndex = (int) (random.nextDouble() * (double) i);
486: indexList[i] = indexList[hIndex];
487: indexList[hIndex] = hValue;
488: }
489:
490: // initialize arrays that are used to count instances
491: // of each value and to keep the amount of instances of that value
492: // that has to be changed
493: // this is done for the missing values in the two variables
494: // missing_count and missing_max
495: int numValues = instances.attribute(attIndex).numValues();
496:
497: partition_count = new int[numValues];
498: partition_max = new int[numValues];
499: int missing_count = 0;
500: ;
501: int missing_max = 0;
502: ;
503:
504: for (int i = 0; i < numValues; i++) {
505: partition_count[i] = 0;
506: partition_max[i] = 0;
507: }
508:
509: // go through the dataset and count all occurrences of values
510: // and all missing values using temporarily .._max arrays and
511: // variable missing_max
512: for (Enumeration e = instances.enumerateInstances(); e
513: .hasMoreElements();) {
514: Instance instance = (Instance) e.nextElement();
515: if (instance.isMissing(attIndex)) {
516: missing_max++;
517: } else {
518: int j = (int) instance.value(attIndex);
519: partition_max[(int) instance.value(attIndex)]++;
520: }
521: }
522:
523: // use given percentage to calculate
524: // how many have to be changed per split and
525: // how many of the missing values
526: if (!useMissing) {
527: missing_max = missing_count;
528: } else {
529: missing_max = (int) (((double) missing_max / 100)
530: * splitPercent + 0.5);
531: }
532: int sum_max = missing_max;
533: for (int i = 0; i < numValues; i++) {
534: partition_max[i] = (int) (((double) partition_max[i] / 100)
535: * splitPercent + 0.5);
536: sum_max = sum_max + partition_max[i];
537: }
538:
539: // initialize sum_count to zero, use this variable to see if
540: // everything is done already
541: int sum_count = 0;
542:
543: // add noise
544: // using the randomized index-array
545: //
546: Random randomValue = new Random(seed);
547: int numOfValues = instances.attribute(attIndex).numValues();
548: for (int i = 0; i < instances.numInstances(); i++) {
549: if (sum_count >= sum_max) {
550: break;
551: } // finished
552: Instance currInstance = instances.instance(indexList[i]);
553: // if value is missing then...
554: if (currInstance.isMissing(attIndex)) {
555: if (missing_count < missing_max) {
556: changeValueRandomly(randomValue, numOfValues,
557: attIndex, currInstance, useMissing);
558: missing_count++;
559: sum_count++;
560: }
561:
562: } else {
563: int vIndex = (int) currInstance.value(attIndex);
564: if (partition_count[vIndex] < partition_max[vIndex]) {
565: changeValueRandomly(randomValue, numOfValues,
566: attIndex, currInstance, useMissing);
567: partition_count[vIndex]++;
568: sum_count++;
569: }
570: }
571: }
572:
573: }
574:
575: /**
576: * method to set a new value
577: *
578: * @param r random function
579: * @param numOfValues
580: * @param instance
581: * @param useMissing
582: */
583: private void changeValueRandomly(Random r, int numOfValues,
584: int indexOfAtt, Instance instance, boolean useMissing) {
585: int currValue;
586:
587: // get current value
588: // if value is missing set current value to number of values
589: // whiche is the highest possible value plus one
590: if (instance.isMissing(indexOfAtt)) {
591: currValue = numOfValues;
592: } else {
593: currValue = (int) instance.value(indexOfAtt);
594: }
595:
596: // with only two possible values it is easier
597: if ((numOfValues == 2) && (!instance.isMissing(indexOfAtt))) {
598: instance.setValue(indexOfAtt,
599: (double) ((currValue + 1) % 2));
600: } else {
601: // get randomly a new value not equal to the current value
602: // if missing values are used as values they must be treated
603: // in a special way
604: while (true) {
605: int newValue;
606: if (useMissing) {
607: newValue = (int) (r.nextDouble() * (double) (numOfValues + 1));
608: } else {
609: newValue = (int) (r.nextDouble() * (double) numOfValues);
610: }
611: // have we found a new value?
612: if (newValue != currValue) {
613: // the value 1 above the highest possible value (=numOfValues)
614: // is used as missing value
615: if (newValue == numOfValues) {
616: instance.setMissing(indexOfAtt);
617: } else {
618: instance
619: .setValue(indexOfAtt, (double) newValue);
620: }
621: break;
622: }
623: }
624: }
625: }
626:
627: /**
628: * Main method for testing this class.
629: *
630: * @param argv should contain arguments to the filter:
631: * use -h for help
632: */
633: public static void main(String[] argv) {
634: runFilter(new AddNoise(), argv);
635: }
636: }
|