001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * AttributeSelectedClassifier.java
019: * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.meta;
024:
025: import weka.attributeSelection.ASEvaluation;
026: import weka.attributeSelection.ASSearch;
027: import weka.attributeSelection.AttributeSelection;
028: import weka.classifiers.SingleClassifierEnhancer;
029: import weka.core.AdditionalMeasureProducer;
030: import weka.core.Capabilities;
031: import weka.core.Drawable;
032: import weka.core.Instance;
033: import weka.core.Instances;
034: import weka.core.Option;
035: import weka.core.OptionHandler;
036: import weka.core.Utils;
037: import weka.core.WeightedInstancesHandler;
038: import weka.core.Capabilities.Capability;
039:
040: import java.util.Enumeration;
041: import java.util.Random;
042: import java.util.Vector;
043:
044: /**
045: <!-- globalinfo-start -->
046: * Dimensionality of training and test data is reduced by attribute selection before being passed on to a classifier.
047: * <p/>
048: <!-- globalinfo-end -->
049: *
050: <!-- options-start -->
051: * Valid options are: <p/>
052: *
053: * <pre> -E <attribute evaluator specification>
054: * Full class name of attribute evaluator, followed
055: * by its options.
056: * eg: "weka.attributeSelection.CfsSubsetEval -L"
057: * (default weka.attributeSelection.CfsSubsetEval)</pre>
058: *
059: * <pre> -S <search method specification>
060: * Full class name of search method, followed
061: * by its options.
062: * eg: "weka.attributeSelection.BestFirst -D 1"
063: * (default weka.attributeSelection.BestFirst)</pre>
064: *
065: * <pre> -D
066: * If set, classifier is run in debug mode and
067: * may output additional info to the console</pre>
068: *
069: * <pre> -W
070: * Full name of base classifier.
071: * (default: weka.classifiers.trees.J48)</pre>
072: *
073: * <pre>
074: * Options specific to classifier weka.classifiers.trees.J48:
075: * </pre>
076: *
077: * <pre> -U
078: * Use unpruned tree.</pre>
079: *
080: * <pre> -C <pruning confidence>
081: * Set confidence threshold for pruning.
082: * (default 0.25)</pre>
083: *
084: * <pre> -M <minimum number of instances>
085: * Set minimum number of instances per leaf.
086: * (default 2)</pre>
087: *
088: * <pre> -R
089: * Use reduced error pruning.</pre>
090: *
091: * <pre> -N <number of folds>
092: * Set number of folds for reduced error
093: * pruning. One fold is used as pruning set.
094: * (default 3)</pre>
095: *
096: * <pre> -B
097: * Use binary splits only.</pre>
098: *
099: * <pre> -S
100: * Don't perform subtree raising.</pre>
101: *
102: * <pre> -L
103: * Do not clean up after the tree has been built.</pre>
104: *
105: * <pre> -A
106: * Laplace smoothing for predicted probabilities.</pre>
107: *
108: * <pre> -Q <seed>
109: * Seed for random data shuffling (default 1).</pre>
110: *
111: <!-- options-end -->
112: *
113: * @author Mark Hall (mhall@cs.waikato.ac.nz)
114: * @version $Revision: 1.25 $
115: */
116: public class AttributeSelectedClassifier extends
117: SingleClassifierEnhancer implements OptionHandler, Drawable,
118: AdditionalMeasureProducer, WeightedInstancesHandler {
119:
120: /** for serialization */
121: static final long serialVersionUID = -5951805453487947577L;
122:
123: /** The attribute selection object */
124: protected AttributeSelection m_AttributeSelection = null;
125:
126: /** The attribute evaluator to use */
127: protected ASEvaluation m_Evaluator = new weka.attributeSelection.CfsSubsetEval();
128:
129: /** The search method to use */
130: protected ASSearch m_Search = new weka.attributeSelection.BestFirst();
131:
132: /** The header of the dimensionally reduced data */
133: protected Instances m_ReducedHeader;
134:
135: /** The number of class vals in the training data (1 if class is numeric) */
136: protected int m_numClasses;
137:
138: /** The number of attributes selected by the attribute selection phase */
139: protected double m_numAttributesSelected;
140:
141: /** The time taken to select attributes in milliseconds */
142: protected double m_selectionTime;
143:
144: /** The time taken to select attributes AND build the classifier */
145: protected double m_totalTime;
146:
147: /**
148: * String describing default classifier.
149: *
150: * @return the default classifier classname
151: */
152: protected String defaultClassifierString() {
153:
154: return "weka.classifiers.trees.J48";
155: }
156:
157: /**
158: * Default constructor.
159: */
160: public AttributeSelectedClassifier() {
161: m_Classifier = new weka.classifiers.trees.J48();
162: }
163:
164: /**
165: * Returns a string describing this search method
166: * @return a description of the search method suitable for
167: * displaying in the explorer/experimenter gui
168: */
169: public String globalInfo() {
170: return "Dimensionality of training and test data is reduced by "
171: + "attribute selection before being passed on to a classifier.";
172: }
173:
174: /**
175: * Returns an enumeration describing the available options.
176: *
177: * @return an enumeration of all the available options.
178: */
179: public Enumeration listOptions() {
180: Vector newVector = new Vector(3);
181:
182: newVector
183: .addElement(new Option(
184: "\tFull class name of attribute evaluator, followed\n"
185: + "\tby its options.\n"
186: + "\teg: \"weka.attributeSelection.CfsSubsetEval -L\"\n"
187: + "\t(default weka.attributeSelection.CfsSubsetEval)",
188: "E", 1,
189: "-E <attribute evaluator specification>"));
190:
191: newVector
192: .addElement(new Option(
193: "\tFull class name of search method, followed\n"
194: + "\tby its options.\n"
195: + "\teg: \"weka.attributeSelection.BestFirst -D 1\"\n"
196: + "\t(default weka.attributeSelection.BestFirst)",
197: "S", 1, "-S <search method specification>"));
198:
199: Enumeration enu = super .listOptions();
200: while (enu.hasMoreElements()) {
201: newVector.addElement(enu.nextElement());
202: }
203: return newVector.elements();
204: }
205:
206: /**
207: * Parses a given list of options. <p/>
208: *
209: <!-- options-start -->
210: * Valid options are: <p/>
211: *
212: * <pre> -E <attribute evaluator specification>
213: * Full class name of attribute evaluator, followed
214: * by its options.
215: * eg: "weka.attributeSelection.CfsSubsetEval -L"
216: * (default weka.attributeSelection.CfsSubsetEval)</pre>
217: *
218: * <pre> -S <search method specification>
219: * Full class name of search method, followed
220: * by its options.
221: * eg: "weka.attributeSelection.BestFirst -D 1"
222: * (default weka.attributeSelection.BestFirst)</pre>
223: *
224: * <pre> -D
225: * If set, classifier is run in debug mode and
226: * may output additional info to the console</pre>
227: *
228: * <pre> -W
229: * Full name of base classifier.
230: * (default: weka.classifiers.trees.J48)</pre>
231: *
232: * <pre>
233: * Options specific to classifier weka.classifiers.trees.J48:
234: * </pre>
235: *
236: * <pre> -U
237: * Use unpruned tree.</pre>
238: *
239: * <pre> -C <pruning confidence>
240: * Set confidence threshold for pruning.
241: * (default 0.25)</pre>
242: *
243: * <pre> -M <minimum number of instances>
244: * Set minimum number of instances per leaf.
245: * (default 2)</pre>
246: *
247: * <pre> -R
248: * Use reduced error pruning.</pre>
249: *
250: * <pre> -N <number of folds>
251: * Set number of folds for reduced error
252: * pruning. One fold is used as pruning set.
253: * (default 3)</pre>
254: *
255: * <pre> -B
256: * Use binary splits only.</pre>
257: *
258: * <pre> -S
259: * Don't perform subtree raising.</pre>
260: *
261: * <pre> -L
262: * Do not clean up after the tree has been built.</pre>
263: *
264: * <pre> -A
265: * Laplace smoothing for predicted probabilities.</pre>
266: *
267: * <pre> -Q <seed>
268: * Seed for random data shuffling (default 1).</pre>
269: *
270: <!-- options-end -->
271: *
272: * @param options the list of options as an array of strings
273: * @throws Exception if an option is not supported
274: */
275: public void setOptions(String[] options) throws Exception {
276:
277: // same for attribute evaluator
278: String evaluatorString = Utils.getOption('E', options);
279: if (evaluatorString.length() == 0)
280: evaluatorString = weka.attributeSelection.CfsSubsetEval.class
281: .getName();
282: String[] evaluatorSpec = Utils.splitOptions(evaluatorString);
283: if (evaluatorSpec.length == 0) {
284: throw new Exception(
285: "Invalid attribute evaluator specification string");
286: }
287: String evaluatorName = evaluatorSpec[0];
288: evaluatorSpec[0] = "";
289: setEvaluator(ASEvaluation.forName(evaluatorName, evaluatorSpec));
290:
291: // same for search method
292: String searchString = Utils.getOption('S', options);
293: if (searchString.length() == 0)
294: searchString = weka.attributeSelection.BestFirst.class
295: .getName();
296: String[] searchSpec = Utils.splitOptions(searchString);
297: if (searchSpec.length == 0) {
298: throw new Exception("Invalid search specification string");
299: }
300: String searchName = searchSpec[0];
301: searchSpec[0] = "";
302: setSearch(ASSearch.forName(searchName, searchSpec));
303:
304: super .setOptions(options);
305: }
306:
307: /**
308: * Gets the current settings of the Classifier.
309: *
310: * @return an array of strings suitable for passing to setOptions
311: */
312: public String[] getOptions() {
313:
314: String[] super Options = super .getOptions();
315: String[] options = new String[super Options.length + 4];
316:
317: int current = 0;
318:
319: // same attribute evaluator
320: options[current++] = "-E";
321: options[current++] = "" + getEvaluatorSpec();
322:
323: // same for search
324: options[current++] = "-S";
325: options[current++] = "" + getSearchSpec();
326:
327: System.arraycopy(super Options, 0, options, current,
328: super Options.length);
329:
330: return options;
331: }
332:
333: /**
334: * Returns the tip text for this property
335: * @return tip text for this property suitable for
336: * displaying in the explorer/experimenter gui
337: */
338: public String evaluatorTipText() {
339: return "Set the attribute evaluator to use. This evaluator is used "
340: + "during the attribute selection phase before the classifier is "
341: + "invoked.";
342: }
343:
344: /**
345: * Sets the attribute evaluator
346: *
347: * @param evaluator the evaluator with all options set.
348: */
349: public void setEvaluator(ASEvaluation evaluator) {
350: m_Evaluator = evaluator;
351: }
352:
353: /**
354: * Gets the attribute evaluator used
355: *
356: * @return the attribute evaluator
357: */
358: public ASEvaluation getEvaluator() {
359: return m_Evaluator;
360: }
361:
362: /**
363: * Gets the evaluator specification string, which contains the class name of
364: * the attribute evaluator and any options to it
365: *
366: * @return the evaluator string.
367: */
368: protected String getEvaluatorSpec() {
369:
370: ASEvaluation e = getEvaluator();
371: if (e instanceof OptionHandler) {
372: return e.getClass().getName()
373: + " "
374: + Utils.joinOptions(((OptionHandler) e)
375: .getOptions());
376: }
377: return e.getClass().getName();
378: }
379:
380: /**
381: * Returns the tip text for this property
382: * @return tip text for this property suitable for
383: * displaying in the explorer/experimenter gui
384: */
385: public String searchTipText() {
386: return "Set the search method. This search method is used "
387: + "during the attribute selection phase before the classifier is "
388: + "invoked.";
389: }
390:
391: /**
392: * Sets the search method
393: *
394: * @param search the search method with all options set.
395: */
396: public void setSearch(ASSearch search) {
397: m_Search = search;
398: }
399:
400: /**
401: * Gets the search method used
402: *
403: * @return the search method
404: */
405: public ASSearch getSearch() {
406: return m_Search;
407: }
408:
409: /**
410: * Gets the search specification string, which contains the class name of
411: * the search method and any options to it
412: *
413: * @return the search string.
414: */
415: protected String getSearchSpec() {
416:
417: ASSearch s = getSearch();
418: if (s instanceof OptionHandler) {
419: return s.getClass().getName()
420: + " "
421: + Utils.joinOptions(((OptionHandler) s)
422: .getOptions());
423: }
424: return s.getClass().getName();
425: }
426:
427: /**
428: * Returns default capabilities of the classifier.
429: *
430: * @return the capabilities of this classifier
431: */
432: public Capabilities getCapabilities() {
433: Capabilities result;
434:
435: if (getEvaluator() == null)
436: result = super .getCapabilities();
437: else
438: result = getEvaluator().getCapabilities();
439:
440: // set dependencies
441: for (Capability cap : Capability.values())
442: result.enableDependency(cap);
443:
444: return result;
445: }
446:
447: /**
448: * Build the classifier on the dimensionally reduced data.
449: *
450: * @param data the training data
451: * @throws Exception if the classifier could not be built successfully
452: */
453: public void buildClassifier(Instances data) throws Exception {
454: if (m_Classifier == null) {
455: throw new Exception("No base classifier has been set!");
456: }
457:
458: if (m_Evaluator == null) {
459: throw new Exception("No attribute evaluator has been set!");
460: }
461:
462: if (m_Search == null) {
463: throw new Exception("No search method has been set!");
464: }
465:
466: // can classifier handle the data?
467: getCapabilities().testWithFail(data);
468:
469: // remove instances with missing class
470: Instances newData = new Instances(data);
471: newData.deleteWithMissingClass();
472:
473: if (newData.numInstances() == 0) {
474: m_Classifier.buildClassifier(newData);
475: return;
476: }
477: if (newData.classAttribute().isNominal()) {
478: m_numClasses = newData.classAttribute().numValues();
479: } else {
480: m_numClasses = 1;
481: }
482:
483: Instances resampledData = null;
484: // check to see if training data has all equal weights
485: double weight = newData.instance(0).weight();
486: boolean ok = false;
487: for (int i = 1; i < newData.numInstances(); i++) {
488: if (newData.instance(i).weight() != weight) {
489: ok = true;
490: break;
491: }
492: }
493:
494: if (ok) {
495: if (!(m_Evaluator instanceof WeightedInstancesHandler)
496: || !(m_Classifier instanceof WeightedInstancesHandler)) {
497: Random r = new Random(1);
498: for (int i = 0; i < 10; i++) {
499: r.nextDouble();
500: }
501: resampledData = newData.resampleWithWeights(r);
502: }
503: } else {
504: // all equal weights in the training data so just use as is
505: resampledData = newData;
506: }
507:
508: m_AttributeSelection = new AttributeSelection();
509: m_AttributeSelection.setEvaluator(m_Evaluator);
510: m_AttributeSelection.setSearch(m_Search);
511: long start = System.currentTimeMillis();
512: m_AttributeSelection
513: .SelectAttributes((m_Evaluator instanceof WeightedInstancesHandler) ? newData
514: : resampledData);
515: long end = System.currentTimeMillis();
516: if (m_Classifier instanceof WeightedInstancesHandler) {
517: newData = m_AttributeSelection
518: .reduceDimensionality(newData);
519: m_Classifier.buildClassifier(newData);
520: } else {
521: resampledData = m_AttributeSelection
522: .reduceDimensionality(resampledData);
523: m_Classifier.buildClassifier(resampledData);
524: }
525:
526: long end2 = System.currentTimeMillis();
527: m_numAttributesSelected = m_AttributeSelection
528: .numberAttributesSelected();
529: m_ReducedHeader = new Instances(
530: (m_Classifier instanceof WeightedInstancesHandler) ? newData
531: : resampledData, 0);
532: m_selectionTime = (double) (end - start);
533: m_totalTime = (double) (end2 - start);
534: }
535:
536: /**
537: * Classifies a given instance after attribute selection
538: *
539: * @param instance the instance to be classified
540: * @return the class distribution
541: * @throws Exception if instance could not be classified
542: * successfully
543: */
544: public double[] distributionForInstance(Instance instance)
545: throws Exception {
546:
547: Instance newInstance;
548: if (m_AttributeSelection == null) {
549: // throw new Exception("AttributeSelectedClassifier: No model built yet!");
550: newInstance = instance;
551: } else {
552: newInstance = m_AttributeSelection
553: .reduceDimensionality(instance);
554: }
555:
556: return m_Classifier.distributionForInstance(newInstance);
557: }
558:
559: /**
560: * Returns the type of graph this classifier
561: * represents.
562: *
563: * @return the type of graph
564: */
565: public int graphType() {
566:
567: if (m_Classifier instanceof Drawable)
568: return ((Drawable) m_Classifier).graphType();
569: else
570: return Drawable.NOT_DRAWABLE;
571: }
572:
573: /**
574: * Returns graph describing the classifier (if possible).
575: *
576: * @return the graph of the classifier in dotty format
577: * @throws Exception if the classifier cannot be graphed
578: */
579: public String graph() throws Exception {
580:
581: if (m_Classifier instanceof Drawable)
582: return ((Drawable) m_Classifier).graph();
583: else
584: throw new Exception("Classifier: " + getClassifierSpec()
585: + " cannot be graphed");
586: }
587:
588: /**
589: * Output a representation of this classifier
590: *
591: * @return a representation of this classifier
592: */
593: public String toString() {
594: if (m_AttributeSelection == null) {
595: return "AttributeSelectedClassifier: No attribute selection possible.\n\n"
596: + m_Classifier.toString();
597: }
598:
599: StringBuffer result = new StringBuffer();
600: result.append("AttributeSelectedClassifier:\n\n");
601: result.append(m_AttributeSelection.toResultsString());
602: result.append("\n\nHeader of reduced data:\n"
603: + m_ReducedHeader.toString());
604: result.append("\n\nClassifier Model\n"
605: + m_Classifier.toString());
606:
607: return result.toString();
608: }
609:
610: /**
611: * Additional measure --- number of attributes selected
612: * @return the number of attributes selected
613: */
614: public double measureNumAttributesSelected() {
615: return m_numAttributesSelected;
616: }
617:
618: /**
619: * Additional measure --- time taken (milliseconds) to select the attributes
620: * @return the time taken to select attributes
621: */
622: public double measureSelectionTime() {
623: return m_selectionTime;
624: }
625:
626: /**
627: * Additional measure --- time taken (milliseconds) to select attributes
628: * and build the classifier
629: * @return the total time (select attributes + build classifier)
630: */
631: public double measureTime() {
632: return m_totalTime;
633: }
634:
635: /**
636: * Returns an enumeration of the additional measure names
637: * @return an enumeration of the measure names
638: */
639: public Enumeration enumerateMeasures() {
640: Vector newVector = new Vector(3);
641: newVector.addElement("measureNumAttributesSelected");
642: newVector.addElement("measureSelectionTime");
643: newVector.addElement("measureTime");
644: if (m_Classifier instanceof AdditionalMeasureProducer) {
645: Enumeration en = ((AdditionalMeasureProducer) m_Classifier)
646: .enumerateMeasures();
647: while (en.hasMoreElements()) {
648: String mname = (String) en.nextElement();
649: newVector.addElement(mname);
650: }
651: }
652: return newVector.elements();
653: }
654:
655: /**
656: * Returns the value of the named measure
657: * @param additionalMeasureName the name of the measure to query for its value
658: * @return the value of the named measure
659: * @throws IllegalArgumentException if the named measure is not supported
660: */
661: public double getMeasure(String additionalMeasureName) {
662: if (additionalMeasureName
663: .compareToIgnoreCase("measureNumAttributesSelected") == 0) {
664: return measureNumAttributesSelected();
665: } else if (additionalMeasureName
666: .compareToIgnoreCase("measureSelectionTime") == 0) {
667: return measureSelectionTime();
668: } else if (additionalMeasureName
669: .compareToIgnoreCase("measureTime") == 0) {
670: return measureTime();
671: } else if (m_Classifier instanceof AdditionalMeasureProducer) {
672: return ((AdditionalMeasureProducer) m_Classifier)
673: .getMeasure(additionalMeasureName);
674: } else {
675: throw new IllegalArgumentException(additionalMeasureName
676: + " not supported (AttributeSelectedClassifier)");
677: }
678: }
679:
680: /**
681: * Main method for testing this class.
682: *
683: * @param argv should contain the following arguments:
684: * -t training file [-T test file] [-c class index]
685: */
686: public static void main(String[] argv) {
687: runClassifier(new AttributeSelectedClassifier(), argv);
688: }
689: }
|