001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * RandomProjection.java
019: * Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.filters.unsupervised.attribute;
024:
025: import weka.core.Attribute;
026: import weka.core.Capabilities;
027: import weka.core.FastVector;
028: import weka.core.Instance;
029: import weka.core.Instances;
030: import weka.core.Option;
031: import weka.core.OptionHandler;
032: import weka.core.SelectedTag;
033: import weka.core.Tag;
034: import weka.core.TechnicalInformation;
035: import weka.core.TechnicalInformationHandler;
036: import weka.core.Utils;
037: import weka.core.Capabilities.Capability;
038: import weka.core.TechnicalInformation.Field;
039: import weka.core.TechnicalInformation.Type;
040: import weka.filters.Filter;
041: import weka.filters.UnsupervisedFilter;
042:
043: import java.util.Enumeration;
044: import java.util.Random;
045: import java.util.Vector;
046:
047: /**
048: <!-- globalinfo-start -->
049: * Reduces the dimensionality of the data by projecting it onto a lower dimensional subspace using a random matrix with columns of unit length (i.e. It will reduce the number of attributes in the data while preserving much of its variation like PCA, but at a much less computational cost).<br/>
050: * It first applies the NominalToBinary filter to convert all attributes to numeric before reducing the dimension. It preserves the class attribute.<br/>
051: * <br/>
052: * For more information, see:<br/>
053: * <br/>
054: * Dmitriy Fradkin, David Madigan: Experiments with random projections for machine learning. In: KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining, New York, NY, USA, 517-522, 2003.
055: * <p/>
056: <!-- globalinfo-end -->
057: *
058: <!-- technical-bibtex-start -->
059: * BibTeX:
060: * <pre>
061: * @inproceedings{Fradkin2003,
062: * address = {New York, NY, USA},
063: * author = {Dmitriy Fradkin and David Madigan},
064: * booktitle = {KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining},
065: * pages = {517-522},
066: * publisher = {ACM Press},
067: * title = {Experiments with random projections for machine learning},
068: * year = {2003}
069: * }
070: * </pre>
071: * <p/>
072: <!-- technical-bibtex-end -->
073: *
074: <!-- options-start -->
075: * Valid options are: <p/>
076: *
077: * <pre> -N <number>
078: * The number of dimensions (attributes) the data should be reduced to
079: * (default 10; exclusive of the class attribute, if it is set).</pre>
080: *
081: * <pre> -D [SPARSE1|SPARSE2|GAUSSIAN]
082: * The distribution to use for calculating the random matrix.
083: * Sparse1 is:
084: * sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)}
085: * Sparse2 is:
086: * {-1 with prob(1/2), +1 with prob(1/2)}
087: * </pre>
088: *
089: * <pre> -P <percent>
090: * The percentage of dimensions (attributes) the data should
091: * be reduced to (exclusive of the class attribute, if it is set). This -N
092: * option is ignored if this option is present or is greater
093: * than zero.</pre>
094: *
095: * <pre> -M
096: * Replace missing values using the ReplaceMissingValues filter</pre>
097: *
098: * <pre> -R <num>
099: * The random seed for the random number generator used for
100: * calculating the random matrix (default 42).</pre>
101: *
102: <!-- options-end -->
103: *
104: * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
105: * @version $Revision: 1.10 $ [1.0 - 22 July 2003 - Initial version (Ashraf M. Kibriya)]
106: */
107: public class RandomProjection extends Filter implements
108: UnsupervisedFilter, OptionHandler, TechnicalInformationHandler {
109:
110: /** for serialization */
111: static final long serialVersionUID = 4428905532728645880L;
112:
113: /** Stores the number of dimensions to reduce the data to */
114: protected int m_k = 10;
115:
116: /** Stores the dimensionality the data should be reduced to as percentage of the original dimension */
117: protected double m_percent = 0.0;
118:
119: /** Is the random matrix will be computed using
120: Gaussian distribution or not */
121: protected boolean m_useGaussian = false;
122:
123: /** distribution type: sparse 1 */
124: public static final int SPARSE1 = 1;
125: /** distribution type: sparse 2 */
126: public static final int SPARSE2 = 2;
127: /** distribution type: gaussian */
128: public static final int GAUSSIAN = 3;
129:
130: /** The types of distributions that can be used for
131: calculating the random matrix */
132: public static final Tag[] TAGS_DSTRS_TYPE = {
133: new Tag(SPARSE1, "Sparse 1"), new Tag(SPARSE2, "Sparse 2"),
134: new Tag(GAUSSIAN, "Gaussian"), };
135:
136: /** Stores the distribution to use for calculating the
137: random matrix */
138: protected int m_distribution = SPARSE1;
139:
140: /** Should the missing values be replaced using
141: unsupervised.ReplaceMissingValues filter */
142: protected boolean m_useReplaceMissing = false;
143:
144: /** Keeps track of output format if it is defined or not */
145: protected boolean m_OutputFormatDefined = false;
146:
147: /** The NominalToBinary filter applied to the data before this filter */
148: protected Filter m_ntob; // = new weka.filters.unsupervised.attribute.NominalToBinary();
149:
150: /** The ReplaceMissingValues filter */
151: protected Filter m_replaceMissing;
152:
153: /** Stores the random seed used to generate the random matrix */
154: protected long m_rndmSeed = 42;
155:
156: /** The random matrix */
157: protected double m_rmatrix[][];
158:
159: /** The random number generator used for generating the random matrix */
160: protected Random m_random;
161:
162: /**
163: * Returns an enumeration describing the available options.
164: *
165: * @return an enumeration of all the available options.
166: */
167: public Enumeration listOptions() {
168:
169: Vector newVector = new Vector(2);
170:
171: newVector
172: .addElement(new Option(
173: "\tThe number of dimensions (attributes) the data should be reduced to\n"
174: + "\t(default 10; exclusive of the class attribute, if it is set).",
175: "N", 1, "-N <number>"));
176:
177: newVector
178: .addElement(new Option(
179: "\tThe distribution to use for calculating the random matrix.\n"
180: + "\tSparse1 is:\n"
181: + "\t sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)}\n"
182: + "\tSparse2 is:\n"
183: + "\t {-1 with prob(1/2), +1 with prob(1/2)}\n",
184: "D", 1, "-D [SPARSE1|SPARSE2|GAUSSIAN]"));
185:
186: //newVector.addElement(new Option(
187: // "\tUse Gaussian distribution for calculating the random matrix.",
188: // "G", 0, "-G"));
189:
190: newVector
191: .addElement(new Option(
192: "\tThe percentage of dimensions (attributes) the data should\n"
193: + "\tbe reduced to (exclusive of the class attribute, if it is set). This -N\n"
194: + "\toption is ignored if this option is present or is greater\n"
195: + "\tthan zero.", "P", 1,
196: "-P <percent>"));
197:
198: newVector
199: .addElement(new Option(
200: "\tReplace missing values using the ReplaceMissingValues filter",
201: "M", 0, "-M"));
202:
203: newVector
204: .addElement(new Option(
205: "\tThe random seed for the random number generator used for\n"
206: + "\tcalculating the random matrix (default 42).",
207: "R", 0, "-R <num>"));
208:
209: return newVector.elements();
210: }
211:
212: /**
213: * Parses a given list of options. <p/>
214: *
215: <!-- options-start -->
216: * Valid options are: <p/>
217: *
218: * <pre> -N <number>
219: * The number of dimensions (attributes) the data should be reduced to
220: * (default 10; exclusive of the class attribute, if it is set).</pre>
221: *
222: * <pre> -D [SPARSE1|SPARSE2|GAUSSIAN]
223: * The distribution to use for calculating the random matrix.
224: * Sparse1 is:
225: * sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)}
226: * Sparse2 is:
227: * {-1 with prob(1/2), +1 with prob(1/2)}
228: * </pre>
229: *
230: * <pre> -P <percent>
231: * The percentage of dimensions (attributes) the data should
232: * be reduced to (exclusive of the class attribute, if it is set). This -N
233: * option is ignored if this option is present or is greater
234: * than zero.</pre>
235: *
236: * <pre> -M
237: * Replace missing values using the ReplaceMissingValues filter</pre>
238: *
239: * <pre> -R <num>
240: * The random seed for the random number generator used for
241: * calculating the random matrix (default 42).</pre>
242: *
243: <!-- options-end -->
244: *
245: * @param options the list of options as an array of strings
246: * @throws Exception if an option is not supported
247: */
248: public void setOptions(String[] options) throws Exception {
249:
250: String mString = Utils.getOption('P', options);
251: if (mString.length() != 0) {
252: setPercent((double) Double.parseDouble(mString)); //setNumberOfAttributes((int) Integer.parseInt(mString));
253: } else {
254: setPercent(0);
255: mString = Utils.getOption('N', options);
256: if (mString.length() != 0)
257: setNumberOfAttributes(Integer.parseInt(mString));
258: else
259: setNumberOfAttributes(10);
260: }
261:
262: mString = Utils.getOption('R', options);
263: if (mString.length() != 0) {
264: setRandomSeed(Long.parseLong(mString));
265: }
266:
267: mString = Utils.getOption('D', options);
268: if (mString.length() != 0) {
269: if (mString.equalsIgnoreCase("sparse1"))
270: setDistribution(new SelectedTag(SPARSE1,
271: TAGS_DSTRS_TYPE));
272: else if (mString.equalsIgnoreCase("sparse2"))
273: setDistribution(new SelectedTag(SPARSE2,
274: TAGS_DSTRS_TYPE));
275: else if (mString.equalsIgnoreCase("gaussian"))
276: setDistribution(new SelectedTag(GAUSSIAN,
277: TAGS_DSTRS_TYPE));
278: }
279:
280: if (Utils.getFlag('M', options))
281: setReplaceMissingValues(true);
282: else
283: setReplaceMissingValues(false);
284:
285: //if(Utils.getFlag('G', options))
286: // setUseGaussian(true);
287: //else
288: // setUseGaussian(false);
289:
290: }
291:
292: /**
293: * Gets the current settings of the filter.
294: *
295: * @return an array of strings suitable for passing to setOptions
296: */
297: public String[] getOptions() {
298:
299: String[] options = new String[10];
300: int current = 0;
301:
302: //if (getUseGaussian()) {
303: // options[current++] = "-G";
304: //}
305:
306: if (getReplaceMissingValues()) {
307: options[current++] = "-M";
308: }
309:
310: if (getPercent() == 0) {
311: options[current++] = "-N";
312: options[current++] = "" + getNumberOfAttributes();
313: } else {
314: options[current++] = "-P";
315: options[current++] = "" + getPercent();
316: }
317:
318: options[current++] = "-R";
319: options[current++] = "" + getRandomSeed();
320:
321: SelectedTag t = getDistribution();
322: options[current++] = "-D";
323: options[current++] = "" + t.getSelectedTag().getReadable();
324:
325: while (current < options.length) {
326: options[current++] = "";
327: }
328:
329: return options;
330: }
331:
332: /**
333: * Returns a string describing this filter
334: *
335: * @return a description of the filter suitable for
336: * displaying in the explorer/experimenter gui
337: */
338: public String globalInfo() {
339:
340: return "Reduces the dimensionality of the data by projecting"
341: + " it onto a lower dimensional subspace using a random"
342: + " matrix with columns of unit length (i.e. It will reduce"
343: + " the number of attributes in the data while preserving"
344: + " much of its variation like PCA, but at a much less"
345: + " computational cost).\n"
346: + "It first applies the NominalToBinary filter to"
347: + " convert all attributes to numeric before reducing the"
348: + " dimension. It preserves the class attribute.\n\n"
349: + "For more information, see:\n\n"
350: + getTechnicalInformation().toString();
351: }
352:
353: /**
354: * Returns an instance of a TechnicalInformation object, containing
355: * detailed information about the technical background of this class,
356: * e.g., paper reference or book this class is based on.
357: *
358: * @return the technical information about this class
359: */
360: public TechnicalInformation getTechnicalInformation() {
361: TechnicalInformation result;
362:
363: result = new TechnicalInformation(Type.INPROCEEDINGS);
364: result.setValue(Field.AUTHOR,
365: "Dmitriy Fradkin and David Madigan");
366: result
367: .setValue(Field.TITLE,
368: "Experiments with random projections for machine learning");
369: result
370: .setValue(
371: Field.BOOKTITLE,
372: "KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining");
373: result.setValue(Field.YEAR, "003");
374: result.setValue(Field.PAGES, "517-522");
375: result.setValue(Field.PUBLISHER, "ACM Press");
376: result.setValue(Field.ADDRESS, "New York, NY, USA");
377:
378: return result;
379: }
380:
381: /**
382: * Returns the tip text for this property
383: *
384: * @return tip text for this property suitable for
385: * displaying in the explorer/experimenter gui
386: */
387: public String numberOfAttributesTipText() {
388:
389: return "The number of dimensions (attributes) the data should"
390: + " be reduced to.";
391: }
392:
393: /**
394: * Sets the number of attributes (dimensions) the data should be reduced to
395: *
396: * @param newAttNum the goal for the dimensions
397: */
398: public void setNumberOfAttributes(int newAttNum) {
399: m_k = newAttNum;
400: }
401:
402: /**
403: * Gets the current number of attributes (dimensionality) to which the data
404: * will be reduced to.
405: *
406: * @return the number of dimensions
407: */
408: public int getNumberOfAttributes() {
409: return m_k;
410: }
411:
412: /**
413: * Returns the tip text for this property
414: *
415: * @return tip text for this property suitable for
416: * displaying in the explorer/experimenter gui
417: */
418: public String percentTipText() {
419:
420: return " The percentage of dimensions (attributes) the data should"
421: + " be reduced to (inclusive of the class attribute). This "
422: + " NumberOfAttributes option is ignored if this option is"
423: + " present or is greater than zero.";
424: }
425:
426: /**
427: * Sets the percent the attributes (dimensions) of the data should be reduced to
428: *
429: * @param newPercent the percentage of attributes
430: */
431: public void setPercent(double newPercent) {
432: if (newPercent > 0)
433: newPercent /= 100;
434: m_percent = newPercent;
435: }
436:
437: /**
438: * Gets the percent the attributes (dimensions) of the data will be reduced to
439: *
440: * @return the percentage of attributes
441: */
442: public double getPercent() {
443: return m_percent * 100;
444: }
445:
446: /**
447: * Returns the tip text for this property
448: *
449: * @return tip text for this property suitable for
450: * displaying in the explorer/experimenter gui
451: */
452: public String randomSeedTipText() {
453: return "The random seed used by the random"
454: + " number generator used for generating"
455: + " the random matrix ";
456: }
457:
458: /**
459: * Sets the random seed of the random number generator
460: *
461: * @param seed the random seed value
462: */
463: public void setRandomSeed(long seed) {
464: m_rndmSeed = seed;
465: }
466:
467: /**
468: * Gets the random seed of the random number generator
469: *
470: * @return the random seed value
471: */
472: public long getRandomSeed() {
473: return m_rndmSeed;
474: }
475:
476: /**
477: * Returns the tip text for this property
478: *
479: * @return tip text for this property suitable for
480: * displaying in the explorer/experimenter gui
481: */
482: public String distributionTipText() {
483: return "The distribution to use for calculating the random matrix.\n"
484: + "Sparse1 is:\n"
485: + " sqrt(3) * { -1 with prob(1/6), \n"
486: + " 0 with prob(2/3), \n"
487: + " +1 with prob(1/6) } \n"
488: + "Sparse2 is:\n"
489: + " { -1 with prob(1/2), \n"
490: + " +1 with prob(1/2) } ";
491:
492: }
493:
494: /**
495: * Sets the distribution to use for calculating the random matrix
496: *
497: * @param newDstr the distribution to use
498: */
499: public void setDistribution(SelectedTag newDstr) {
500:
501: if (newDstr.getTags() == TAGS_DSTRS_TYPE) {
502: m_distribution = newDstr.getSelectedTag().getID();
503: }
504: }
505:
506: /**
507: * Returns the current distribution that'll be used for calculating the
508: * random matrix
509: *
510: * @return the current distribution
511: */
512: public SelectedTag getDistribution() {
513: return new SelectedTag(m_distribution, TAGS_DSTRS_TYPE);
514: }
515:
516: /**
517: * Returns the tip text for this property
518: *
519: * @return tip text for this property suitable for
520: * displaying in the explorer/experimenter gui
521: */
522: public String replaceMissingValuesTipText() {
523:
524: return "If set the filter uses weka.filters.unsupervised.attribute.ReplaceMissingValues"
525: + " to replace the missing values";
526: }
527:
528: /**
529: * Sets either to use replace missing values filter or not
530: *
531: * @param t if true then the replace missing values is used
532: */
533: public void setReplaceMissingValues(boolean t) {
534: m_useReplaceMissing = t;
535: }
536:
537: /**
538: * Gets the current setting for using ReplaceMissingValues filter
539: *
540: * @return true if the replace missing values filter is used
541: */
542: public boolean getReplaceMissingValues() {
543: return m_useReplaceMissing;
544: }
545:
546: /**
547: * Returns the Capabilities of this filter.
548: *
549: * @return the capabilities of this object
550: * @see Capabilities
551: */
552: public Capabilities getCapabilities() {
553: Capabilities result = super .getCapabilities();
554:
555: // attributes
556: result.enableAllAttributes();
557: result.enable(Capability.MISSING_VALUES);
558:
559: // class
560: result.enableAllClasses();
561: result.enable(Capability.MISSING_CLASS_VALUES);
562: result.enable(Capability.NO_CLASS);
563:
564: return result;
565: }
566:
567: /**
568: * Sets the format of the input instances.
569: *
570: * @param instanceInfo an Instances object containing the input
571: * instance structure (any instances contained in the object are
572: * ignored - only the structure is required).
573: * @return true if the outputFormat may be collected immediately
574: * @throws Exception if the input format can't be set
575: * successfully
576: */
577: public boolean setInputFormat(Instances instanceInfo)
578: throws Exception {
579: super .setInputFormat(instanceInfo);
580: /*
581: if (instanceInfo.classIndex() < 0) {
582: throw new UnassignedClassException("No class has been assigned to the instances");
583: }
584: */
585:
586: for (int i = 0; i < instanceInfo.numAttributes(); i++) {
587: if (i != instanceInfo.classIndex()
588: && instanceInfo.attribute(i).isNominal()) {
589: if (instanceInfo.classIndex() >= 0)
590: m_ntob = new weka.filters.supervised.attribute.NominalToBinary();
591: else
592: m_ntob = new weka.filters.unsupervised.attribute.NominalToBinary();
593:
594: break;
595: }
596: }
597:
598: //r.setSeed(m_rndmSeed); //in case the setRandomSeed() is not
599: //called we better set the seed to its
600: //default value of 42.
601: boolean temp = true;
602: if (m_replaceMissing != null) {
603: m_replaceMissing = new weka.filters.unsupervised.attribute.ReplaceMissingValues();
604: if (m_replaceMissing.setInputFormat(instanceInfo))
605: temp = true;
606: else
607: temp = false;
608: }
609:
610: if (m_ntob != null) {
611: if (m_ntob.setInputFormat(instanceInfo)) {
612: setOutputFormat();
613: return temp && true;
614: } else {
615: return false;
616: }
617: } else {
618: setOutputFormat();
619: return temp && true;
620: }
621: }
622:
623: /**
624: * Input an instance for filtering.
625: *
626: * @param instance the input instance
627: * @return true if the filtered instance may now be
628: * collected with output().
629: * @throws IllegalStateException if no input format has been set
630: */
631: public boolean input(Instance instance) throws Exception {
632:
633: Instance newInstance = null;
634:
635: if (getInputFormat() == null) {
636: throw new IllegalStateException(
637: "No input instance format defined");
638: }
639: if (m_NewBatch) {
640: resetQueue();
641: //if(ntob!=null)
642: // ntob.m_NewBatch=true;
643: m_NewBatch = false;
644: }
645:
646: boolean replaceDone = false;
647: if (m_replaceMissing != null) {
648: if (m_replaceMissing.input(instance)) {
649: if (m_OutputFormatDefined == false)
650: setOutputFormat();
651: newInstance = m_replaceMissing.output();
652: replaceDone = true;
653: } else
654: return false;
655: ;
656: }
657:
658: if (m_ntob != null) {
659: if (replaceDone == false)
660: newInstance = instance;
661: if (m_ntob.input(newInstance)) {
662: if (m_OutputFormatDefined == false)
663: setOutputFormat();
664: newInstance = m_ntob.output();
665: newInstance = convertInstance(newInstance);
666: push(newInstance);
667: return true;
668: } else {
669: return false;
670: }
671: } else {
672: if (replaceDone == false)
673: newInstance = instance;
674: newInstance = convertInstance(newInstance);
675: push(newInstance);
676: return true;
677: }
678: }
679:
680: /**
681: * Signify that this batch of input to the filter is finished.
682: *
683: * @return true if there are instances pending output
684: * @throws NullPointerException if no input structure has been defined,
685: * @throws Exception if there was a problem finishing the batch.
686: */
687: public boolean batchFinished() throws Exception {
688: if (getInputFormat() == null) {
689: throw new NullPointerException(
690: "No input instance format defined");
691: }
692:
693: boolean conversionDone = false;
694: if (m_replaceMissing != null) {
695: if (m_replaceMissing.batchFinished()) {
696: Instance newInstance, instance;
697:
698: while ((instance = m_replaceMissing.output()) != null) {
699: if (!m_OutputFormatDefined)
700: setOutputFormat();
701: if (m_ntob != null) {
702: m_ntob.input(instance);
703: } else {
704: newInstance = convertInstance(instance);
705: push(newInstance);
706: }
707: }
708:
709: if (m_ntob != null) {
710: if (m_ntob.batchFinished()) {
711: //Instance newInstance, instance;
712: while ((instance = m_ntob.output()) != null) {
713: if (!m_OutputFormatDefined)
714: setOutputFormat();
715: newInstance = convertInstance(instance);
716: push(newInstance);
717: }
718: m_ntob = null;
719: }
720: }
721: m_replaceMissing = null;
722: conversionDone = true;
723: }
724: }
725:
726: if (conversionDone == false && m_ntob != null) {
727: if (m_ntob.batchFinished()) {
728: Instance newInstance, instance;
729: while ((instance = m_ntob.output()) != null) {
730: if (!m_OutputFormatDefined)
731: setOutputFormat();
732: newInstance = convertInstance(instance);
733: push(newInstance);
734: }
735: m_ntob = null;
736: }
737: }
738: m_OutputFormatDefined = false;
739: return super .batchFinished();
740: }
741:
742: /** Sets the output format */
743: protected void setOutputFormat() {
744: Instances currentFormat;
745: if (m_ntob != null) {
746: currentFormat = m_ntob.getOutputFormat();
747: } else
748: currentFormat = getInputFormat();
749:
750: if (m_percent > 0) {
751: m_k = (int) ((getInputFormat().numAttributes() - 1) * m_percent);
752: // System.out.print("numAtts: "+currentFormat.numAttributes());
753: // System.out.print("percent: "+m_percent);
754: // System.out.print("percent*numAtts: "+(currentFormat.numAttributes()*m_percent));
755: // System.out.println("m_k: "+m_k);
756: }
757:
758: Instances newFormat;
759: int newClassIndex = -1;
760: FastVector attributes = new FastVector();
761: for (int i = 0; i < m_k; i++) {
762: attributes.addElement(new Attribute("K" + (i + 1)));
763: }
764: if (currentFormat.classIndex() != -1) { //if classindex is set
765: //attributes.removeElementAt(attributes.size()-1);
766: attributes.addElement(currentFormat.attribute(currentFormat
767: .classIndex()));
768: newClassIndex = attributes.size() - 1;
769: }
770:
771: newFormat = new Instances(currentFormat.relationName(),
772: attributes, 0);
773: if (newClassIndex != -1)
774: newFormat.setClassIndex(newClassIndex);
775: m_OutputFormatDefined = true;
776:
777: m_random = new Random();
778: m_random.setSeed(m_rndmSeed);
779:
780: m_rmatrix = new double[m_k][currentFormat.numAttributes()];
781: if (m_distribution == GAUSSIAN) {
782: for (int i = 0; i < m_rmatrix.length; i++)
783: for (int j = 0; j < m_rmatrix[i].length; j++)
784: m_rmatrix[i][j] = m_random.nextGaussian();
785: } else {
786: boolean useDstrWithZero = (m_distribution == SPARSE1);
787: for (int i = 0; i < m_rmatrix.length; i++)
788: for (int j = 0; j < m_rmatrix[i].length; j++)
789: m_rmatrix[i][j] = rndmNum(useDstrWithZero);
790: }
791:
792: setOutputFormat(newFormat);
793: }
794:
795: /**
796: * converts a single instance to the required format
797: *
798: * @param currentInstance the instance to convert
799: * @return the converted instance
800: */
801: protected Instance convertInstance(Instance currentInstance) {
802:
803: Instance newInstance;
804: double vals[] = new double[getOutputFormat().numAttributes()];
805: int classIndex = (m_ntob == null) ? getInputFormat()
806: .classIndex() : m_ntob.getOutputFormat().classIndex();
807:
808: for (int i = 0; i < m_k; i++) {
809: vals[i] = computeRandomProjection(i, classIndex,
810: currentInstance);
811: }
812: if (classIndex != -1) {
813: vals[m_k] = currentInstance.value(classIndex);
814: }
815:
816: newInstance = new Instance(currentInstance.weight(), vals);
817: newInstance.setDataset(getOutputFormat());
818:
819: return newInstance;
820: }
821:
822: /**
823: * computes one random projection for a given instance (skip missing values)
824: *
825: * @param rpIndex offset the new random projection attribute
826: * @param classIndex classIndex of the input instance
827: * @param instance the instance to convert
828: * @return the random sum
829: */
830:
831: protected double computeRandomProjection(int rpIndex,
832: int classIndex, Instance instance) {
833:
834: double sum = 0.0;
835: for (int i = 0; i < instance.numValues(); i++) {
836: int index = instance.index(i);
837: if (index != classIndex) {
838: double value = instance.valueSparse(i);
839: if (!Instance.isMissingValue(value)) {
840: sum += m_rmatrix[rpIndex][index] * value;
841: }
842: }
843: }
844: return sum;
845: }
846:
847: private static final int weights[] = { 1, 1, 4 };
848: private static final int vals[] = { -1, 1, 0 };
849: private static final int weights2[] = { 1, 1 };
850: private static final int vals2[] = { -1, 1 };
851: private static final double sqrt3 = Math.sqrt(3);
852:
853: /**
854: * returns a double x such that <br/>
855: * x = sqrt(3) * { -1 with prob. 1/6, 0 with prob. 2/3, 1 with prob. 1/6 }
856: *
857: * @param useDstrWithZero
858: * @return the generated number
859: */
860: protected double rndmNum(boolean useDstrWithZero) {
861: if (useDstrWithZero)
862: return sqrt3 * vals[weightedDistribution(weights)];
863: else
864: return vals2[weightedDistribution(weights2)];
865: }
866:
867: /**
868: * Calculates a weighted distribution
869: *
870: * @param weights the weights to use
871: * @return
872: */
873: protected int weightedDistribution(int[] weights) {
874: int sum = 0;
875:
876: for (int i = 0; i < weights.length; i++)
877: sum += weights[i];
878:
879: int val = (int) Math.floor(m_random.nextDouble() * sum);
880:
881: for (int i = 0; i < weights.length; i++) {
882: val -= weights[i];
883: if (val < 0)
884: return i;
885: }
886: return -1;
887: }
888:
889: /**
890: * Main method for testing this class.
891: *
892: * @param argv should contain arguments to the filter:
893: * use -h for help
894: */
895: public static void main(String[] argv) {
896: runFilter(new RandomProjection(), argv);
897: }
898: }
|