0001: /*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * CheckEstimator.java
0019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.estimators;
0024:
0025: import weka.core.Attribute;
0026: import weka.core.FastVector;
0027: import weka.core.Instance;
0028: import weka.core.Instances;
0029: import weka.core.Option;
0030: import weka.core.OptionHandler;
0031: import weka.core.TestInstances;
0032: import weka.core.Utils;
0033: import weka.core.WeightedInstancesHandler;
0034:
0035: import java.util.Enumeration;
0036: import java.util.Random;
0037: import java.util.Vector;
0038:
0039: /**
 * Class for examining the capabilities of estimators and for finding
 * problems with them. If you implement an estimator using the WEKA
 * libraries, you should run these checks on it to ensure robustness and
 * correct operation. Passing all the tests of this object does not mean
 * that bugs in the estimator don't exist, but it will help to find some
 * common ones. <p/>
0046: *
0047: * Typical usage: <p/>
0048: * <code>java weka.estimators.CheckEstimator -W estimator_name
0049: * estimator_options </code><p/>
0050: *
 * This class uses code from the CheckEstimatorClass. <p/>
 * ATTENTION! Current estimators can only:
 * 1. split on a nominal class attribute,
 * 2. build estimators for nominal and numeric attributes,
 * 3. build estimators independently of the class type.
 * The functionality to test on other class and attribute types
 * is, to a large extent, still left in the code.
0058: *
0059: * CheckEstimator reports on the following:
0060: * <ul>
0061: * <li> Estimator abilities
0062: * <ul>
0063: * <li> Possible command line options to the estimator </li>
0064: * <li> Whether the estimator can predict nominal, numeric, string,
0065: * date or relational class attributes. Warnings will be displayed if
0066: * performance is worse than ZeroR </li>
0067: * <li> Whether the estimator can be trained incrementally </li>
0068: * <li> Whether the estimator can build estimates for numeric attributes </li>
0069: * <li> Whether the estimator can handle nominal attributes </li>
0070: * <li> Whether the estimator can handle string attributes </li>
0071: * <li> Whether the estimator can handle date attributes </li>
0072: * <li> Whether the estimator can handle relational attributes </li>
0073: * <li> Whether the estimator build estimates for multi-instance data </li>
0074: * <li> Whether the estimator can handle missing attribute values </li>
0075: * <li> Whether the estimator can handle missing class values </li>
0076: * <li> Whether a nominal estimator only handles 2 class problems </li>
0077: * <li> Whether the estimator can handle instance weights </li>
0078: * </ul>
0079: * </li>
0080: * <li> Correct functioning
0081: * <ul>
0082: * <li> Correct initialisation during addvalues (i.e. no result
0083: * changes when addValues called repeatedly) </li>
0084: * <li> Whether incremental training produces the same results
0085: * as during non-incremental training (which may or may not
0086: * be OK) </li>
 * <li> Whether the estimator alters the data passed to it
 * (number of instances, instance order, instance weights, etc) </li>
0089: * </ul>
0090: * </li>
0091: * <li> Degenerate cases
0092: * <ul>
0093: * <li> building estimator with zero training instances </li>
 * <li> all but one of the attribute values missing </li>
 * <li> all attribute values missing </li>
0096: * <li> all but one class values missing </li>
0097: * <li> all class values missing </li>
0098: * </ul>
0099: * </li>
0100: * </ul>
0101: * Running CheckEstimator with the debug option set will output the
0102: * training and test datasets for any failed tests.<p/>
0103: *
0104: * The <code>weka.estimators.AbstractEstimatorTest</code> uses this
0105: * class to test all the estimators. Any changes here, have to be
0106: * checked in that abstract test class, too. <p/>
0107: *
0108: <!-- options-start -->
0109: * Valid options are: <p/>
0110: *
0111: * <pre> -D
0112: * Turn on debugging output.</pre>
0113: *
0114: * <pre> -S
0115: * Silent mode - prints nothing to stdout.</pre>
0116: *
0117: * <pre> -N <num>
0118: * The number of instances in the datasets (default 100).</pre>
0119: *
0120: * <pre> -W
0121: * Full name of the estimator analysed.
0122: * eg: weka.estimators.bayes.NaiveBayes</pre>
0123: *
0124: * <pre>
0125: * Options specific to estimator weka.estimators.rules.ZeroR:
0126: * </pre>
0127: *
0128: * <pre> -D
0129: * If set, estimator is run in debug mode and
0130: * may output additional info to the console</pre>
0131: *
0132: <!-- options-end -->
0133: *
0134: * Options after -- are passed to the designated estimator.<p/>
0135: *
0136: * @author Len Trigg (trigg@cs.waikato.ac.nz)
0137: * @author FracPete (fracpete at waikato dot ac dot nz)
0138: * @version $Revision: 1.3 $
0139: * @see TestInstances
0140: */
0141: public class CheckEstimator implements OptionHandler {
0142:
0143: /*
0144: * Note about test methods:
0145: * - methods return array of booleans
0146: * - first index: success or not
0147: * - second index: acceptable or not (e.g., Exception is OK)
0148: * - in case the performance is worse than that of ZeroR both indices are true
0149: *
0150: * FracPete (fracpete at waikato dot ac dot nz)
0151: */
0152:
0153: /** a class for postprocessing the test-data
0154: */
0155: public class PostProcessor {
0156: /**
0157: * Provides a hook for derived classes to further modify the data. Currently,
0158: * the data is just passed through.
0159: *
0160: * @param data the data to process
0161: * @return the processed data
0162: */
0163: protected Instances process(Instances data) {
0164: return data;
0165: }
0166: }
0167:
0168: /*** The estimator to be examined */
0169: protected Estimator m_Estimator = (Estimator) new weka.estimators.NormalEstimator(
0170: 0.000001);
0171:
0172: /** The options to be passed to the base estimator. */
0173: protected String[] m_EstimatorOptions;
0174:
0175: /** The results of the analysis as a string */
0176: protected String m_AnalysisResults;
0177:
0178: /** Debugging mode, gives extra output if true */
0179: protected boolean m_Debug = false;
0180:
0181: /** Silent mode, for no output at all to stdout */
0182: protected boolean m_Silent = false;
0183:
0184: /** The number of instances in the datasets */
0185: protected int m_NumInstances = 100;
0186:
0187: /** for post-processing the data even further */
0188: protected PostProcessor m_PostProcessor = null;
0189:
0190: /** whether classpath problems occurred */
0191: protected boolean m_ClasspathProblems = false;
0192:
0193: /**
0194: * class that contains info about the attribute types the estimator can estimate
0195: * estimator work on one attribute only
0196: */
0197: public static class AttrTypes {
0198: boolean nominal = false;
0199: boolean numeric = false;
0200: boolean string = false;
0201: boolean date = false;
0202: boolean relational = false;
0203:
0204: AttrTypes() {
0205: }
0206:
0207: AttrTypes(AttrTypes newTypes) {
0208: nominal = newTypes.nominal;
0209: numeric = newTypes.numeric;
0210: string = newTypes.string;
0211: date = newTypes.date;
0212: relational = newTypes.relational;
0213: }
0214:
0215: AttrTypes(int type) {
0216: if (type == Attribute.NOMINAL)
0217: nominal = true;
0218: if (type == Attribute.NUMERIC)
0219: numeric = true;
0220: if (type == Attribute.STRING)
0221: string = true;
0222: if (type == Attribute.DATE)
0223: date = true;
0224: if (type == Attribute.RELATIONAL)
0225: relational = true;
0226: }
0227:
0228: int getSetType() throws Exception {
0229: int sum = 0;
0230: int type = -1;
0231: if (nominal) {
0232: sum++;
0233: type = Attribute.NOMINAL;
0234: }
0235: if (numeric) {
0236: sum++;
0237: type = Attribute.NUMERIC;
0238: }
0239: if (string) {
0240: sum++;
0241: type = Attribute.STRING;
0242: }
0243: if (date) {
0244: sum++;
0245: type = Attribute.DATE;
0246: }
0247: if (relational) {
0248: sum++;
0249: type = Attribute.RELATIONAL;
0250: }
0251: if (sum > 1)
0252: throw new Exception(
0253: "Expected to have only one type set used wrongly.");
0254: if (type < 0)
0255: throw new Exception("No type set.");
0256: return type;
0257: }
0258:
0259: boolean oneIsSet() {
0260: return (nominal || numeric || string || date || relational);
0261: }
0262:
0263: public Vector getVectorOfAttrTypes() {
0264: Vector attrs = new Vector();
0265: if (nominal)
0266: attrs.add(new Integer(Attribute.NOMINAL));
0267: if (numeric)
0268: attrs.add(new Integer(Attribute.NUMERIC));
0269: if (string)
0270: attrs.add(new Integer(Attribute.STRING));
0271: if (date)
0272: attrs.add(new Integer(Attribute.DATE));
0273: if (relational)
0274: attrs.add(new Integer(Attribute.RELATIONAL));
0275: return attrs;
0276: }
0277: }
0278:
0279: /**
0280: * public class that contains info about the chosen attribute type
0281: * estimator work on one attribute only
0282: */
0283: public static class EstTypes {
0284: boolean incremental = false;
0285: boolean weighted = false;
0286: boolean super vised = false;
0287:
0288: /**
0289: * Constructor
0290: */
0291: public EstTypes() {
0292: }
0293:
0294: /**
0295: * Constructor
0296: */
0297: public EstTypes(boolean i, boolean w, boolean s) {
0298: incremental = i;
0299: weighted = w;
0300: super vised = s;
0301: }
0302: }
0303:
0304: /**
0305: * Returns an enumeration describing the available options.
0306: *
0307: * @return an enumeration of all the available options.
0308: */
0309: public Enumeration listOptions() {
0310:
0311: Vector newVector = new Vector(2);
0312:
0313: newVector.addElement(new Option("\tTurn on debugging output.",
0314: "D", 0, "-D"));
0315:
0316: newVector.addElement(new Option(
0317: "\tSilent mode - prints nothing to stdout.", "S", 0,
0318: "-S"));
0319:
0320: newVector
0321: .addElement(new Option(
0322: "\tThe number of instances in the datasets (default 100).",
0323: "N", 1, "-N <num>"));
0324:
0325: newVector.addElement(new Option(
0326: "\tFull name of the estimator analysed.\n"
0327: + "\teg: weka.estimators.NormalEstimator", "W",
0328: 1, "-W"));
0329:
0330: if ((m_Estimator != null)
0331: && (m_Estimator instanceof OptionHandler)) {
0332: newVector.addElement(new Option("", "", 0,
0333: "\nOptions specific to estimator "
0334: + m_Estimator.getClass().getName() + ":"));
0335: Enumeration enu = ((OptionHandler) m_Estimator)
0336: .listOptions();
0337: while (enu.hasMoreElements())
0338: newVector.addElement(enu.nextElement());
0339: }
0340:
0341: return newVector.elements();
0342: }
0343:
0344: /**
0345: * Parses a given list of options.
0346: *
0347: <!-- options-start -->
0348: * Valid options are: <p/>
0349: *
0350: * <pre> -D
0351: * Turn on debugging output.</pre>
0352: *
0353: * <pre> -S
0354: * Silent mode - prints nothing to stdout.</pre>
0355: *
0356: * <pre> -N <num>
0357: * The number of instances in the datasets (default 100).</pre>
0358: *
0359: * <pre> -W
0360: * Full name of the estimator analysed.
0361: * eg: weka.estimators.NormalEstimator</pre>
0362: *
0363: * <pre>
0364: * Options specific to estimator weka.estimators.NormalEstimator:
0365: * </pre>
0366: *
0367: * <pre> -D
0368: * If set, estimator is run in debug mode and
0369: * may output additional info to the console</pre>
0370: *
0371: <!-- options-end -->
0372: *
0373: * @param options the list of options as an array of strings
0374: * @throws Exception if an option is not supported
0375: */
0376: public void setOptions(String[] options) throws Exception {
0377: String tmpStr;
0378:
0379: setDebug(Utils.getFlag('D', options));
0380:
0381: setSilent(Utils.getFlag('S', options));
0382:
0383: tmpStr = Utils.getOption('N', options);
0384: if (tmpStr.length() != 0)
0385: setNumInstances(Integer.parseInt(tmpStr));
0386: else
0387: setNumInstances(100);
0388:
0389: tmpStr = Utils.getOption('W', options);
0390: if (tmpStr.length() == 0)
0391: throw new Exception(
0392: "A estimator must be specified with the -W option.");
0393: setEstimator(Estimator.forName(tmpStr, Utils
0394: .partitionOptions(options)));
0395: }
0396:
0397: /**
0398: * Gets the current settings of the CheckEstimator.
0399: *
0400: * @return an array of strings suitable for passing to setOptions
0401: */
0402: public String[] getOptions() {
0403: Vector result;
0404: String[] options;
0405: int i;
0406:
0407: result = new Vector();
0408:
0409: if (getDebug())
0410: result.add("-D");
0411:
0412: if (getSilent())
0413: result.add("-S");
0414:
0415: result.add("-N");
0416: result.add("" + getNumInstances());
0417:
0418: if (getEstimator() != null) {
0419: result.add("-W");
0420: result.add(getEstimator().getClass().getName());
0421: }
0422:
0423: if ((m_Estimator != null)
0424: && (m_Estimator instanceof OptionHandler))
0425: options = ((OptionHandler) m_Estimator).getOptions();
0426: else
0427: options = new String[0];
0428:
0429: if (options.length > 0) {
0430: result.add("--");
0431: for (i = 0; i < options.length; i++)
0432: result.add(options[i]);
0433: }
0434:
0435: return (String[]) result.toArray(new String[result.size()]);
0436: }
0437:
0438: /**
0439: * sets the PostProcessor to use
0440: *
0441: * @param value the new PostProcessor
0442: * @see #m_PostProcessor
0443: */
0444: public void setPostProcessor(PostProcessor value) {
0445: m_PostProcessor = value;
0446: }
0447:
0448: /**
0449: * returns the current PostProcessor, can be null
0450: *
0451: * @return the current PostProcessor
0452: */
0453: public PostProcessor getPostProcessor() {
0454: return m_PostProcessor;
0455: }
0456:
0457: /**
0458: * returns TRUE if the estimator returned a "not in classpath" Exception
0459: *
0460: * @return true if CLASSPATH problems occurred
0461: */
0462: public boolean hasClasspathProblems() {
0463: return m_ClasspathProblems;
0464: }
0465:
0466: /**
0467: * Begin the tests, reporting results to System.out
0468: */
0469: public void doTests() {
0470:
0471: if (getEstimator() == null) {
0472: println("\n=== No estimator set ===");
0473: return;
0474: }
0475: println("\n=== Check on Estimator: "
0476: + getEstimator().getClass().getName() + " ===\n");
0477:
0478: m_ClasspathProblems = false;
0479:
0480: // Start tests with test for options
0481: canTakeOptions();
0482:
0483: // test what type of estimator it is
0484: EstTypes estTypes = new EstTypes();
0485: estTypes.incremental = incrementalEstimator()[0];
0486: estTypes.weighted = weightedInstancesHandler()[0];
0487: estTypes.super vised = super visedEstimator()[0];
0488:
0489: // in none of the estimators yet the functionality is depending on the class type
0490: // since this could change the basic structure taken from checkclassifiers is kept here
0491: int classType = Attribute.NOMINAL;
0492: AttrTypes attrTypes = testsPerClassType(classType, estTypes);
0493:
0494: // only nominal class can be split up so far
0495: canSplitUpClass(attrTypes, classType);
0496: }
0497:
0498: /**
0499: * Set debugging mode
0500: *
0501: * @param debug true if debug output should be printed
0502: */
0503: public void setDebug(boolean debug) {
0504: m_Debug = debug;
0505:
0506: // disable silent mode, if necessary
0507: if (getDebug())
0508: setSilent(false);
0509: }
0510:
0511: /**
0512: * Get whether debugging is turned on
0513: *
0514: * @return true if debugging output is on
0515: */
0516: public boolean getDebug() {
0517: return m_Debug;
0518: }
0519:
0520: /**
0521: * Set slient mode, i.e., no output at all to stdout
0522: *
0523: * @param value whether silent mode is active or not
0524: */
0525: public void setSilent(boolean value) {
0526: m_Silent = value;
0527: }
0528:
0529: /**
0530: * Get whether silent mode is turned on
0531: *
0532: * @return true if silent mode is on
0533: */
0534: public boolean getSilent() {
0535: return m_Silent;
0536: }
0537:
0538: /**
0539: * Sets the number of instances to use in the datasets (some estimators
0540: * might require more instances).
0541: *
0542: * @param value the number of instances to use
0543: */
0544: public void setNumInstances(int value) {
0545: m_NumInstances = value;
0546: }
0547:
0548: /**
0549: * Gets the current number of instances to use for the datasets.
0550: *
0551: * @return the number of instances
0552: */
0553: public int getNumInstances() {
0554: return m_NumInstances;
0555: }
0556:
0557: /**
0558: * Set the estimator for boosting.
0559: *
0560: * @param newEstimator the Estimator to use.
0561: */
0562: public void setEstimator(Estimator newEstimator) {
0563: m_Estimator = newEstimator;
0564: }
0565:
0566: /**
0567: * Get the estimator used as the estimator
0568: *
0569: * @return the estimator used as the estimator
0570: */
0571: public Estimator getEstimator() {
0572: return m_Estimator;
0573: }
0574:
0575: /**
0576: * prints the given message to stdout, if not silent mode
0577: *
0578: * @param msg the text to print to stdout
0579: */
0580: protected void print(Object msg) {
0581: if (!getSilent())
0582: System.out.print(msg);
0583: }
0584:
0585: /**
0586: * prints the given message (+ LF) to stdout, if not silent mode
0587: *
0588: * @param msg the message to println to stdout
0589: */
0590: protected void println(Object msg) {
0591: print(msg + "\n");
0592: }
0593:
0594: /**
0595: * prints a LF to stdout, if not silent mode
0596: */
0597: protected void println() {
0598: print("\n");
0599: }
0600:
0601: /**
0602: * Run a battery of tests for a given class attribute type
0603: *
0604: * @param classType true if the class attribute should be numeric
0605: * @param estTypes types the estimator is, like incremental, weighted, supervised etc
0606: * @return attribute types estimator can work with
0607: */
0608: protected AttrTypes testsPerClassType(int classType,
0609: EstTypes estTypes) {
0610:
0611: // in none of the estimators yet is the estimation depending on the class type
0612: // since this could change the basic structure taken from checkclassifiers is kept here
0613:
0614: // test A: simple test - if can estimate
0615: AttrTypes attrTypes = new AttrTypes();
0616: AttrTypes at = new AttrTypes(Attribute.NOMINAL);
0617: attrTypes.nominal = canEstimate(at, estTypes.super vised,
0618: classType)[0];
0619: at = new AttrTypes(Attribute.NUMERIC);
0620: attrTypes.numeric = canEstimate(at, estTypes.super vised,
0621: classType)[0];
0622: attrTypes.string = false;
0623: attrTypes.date = false;
0624: attrTypes.relational = false;
0625:
0626: // if (!multiInstance)
0627: // PRel = canEstimate(false, false, false, false, true, classType)[0];
0628: // else
0629: // PRel = false;
0630:
0631: // one of the attribute types succeeded
0632:
0633: if (attrTypes.oneIsSet()) {
0634: Vector attributesSet = attrTypes.getVectorOfAttrTypes();
0635:
0636: // make tests for each attribute
0637: for (int i = 0; i < attributesSet.size(); i++) {
0638: AttrTypes workAttrTypes = new AttrTypes(
0639: ((Integer) attributesSet.elementAt(i))
0640: .intValue());
0641:
0642: // test B: weights change estimate or not
0643: if (estTypes.weighted)
0644: instanceWeights(workAttrTypes, classType);
0645:
0646: if (classType == Attribute.NOMINAL) {
0647: int numClasses = 4;
0648: canHandleNClasses(workAttrTypes, numClasses);
0649: }
0650:
0651: // tests with class not the last attribute and the attribute not the first
0652:
0653: // if (!multiInstance) {
0654: int numAtt = 4;
0655:
0656: canHandleClassAsNthAttribute(workAttrTypes, numAtt, 0,
0657: classType, 1);
0658:
0659: //TODOTODOcanHandleAttrAsNthAttribute(workAttrTypes, numAtt, 2, classType);
0660: //}
0661:
0662: canHandleZeroTraining(workAttrTypes, classType);
0663: boolean handleMissingAttributes = canHandleMissing(
0664: workAttrTypes, classType, true, false, 20)[0];
0665: if (handleMissingAttributes)
0666: canHandleMissing(workAttrTypes, classType, true,
0667: false, 100);
0668:
0669: boolean handleMissingClass = canHandleMissing(
0670: workAttrTypes, classType, false, true, 20)[0];
0671: if (handleMissingClass)
0672: canHandleMissing(workAttrTypes, classType, false,
0673: true, 100);
0674:
0675: correctBuildInitialisation(workAttrTypes, classType);
0676: datasetIntegrity(workAttrTypes, classType,
0677: handleMissingAttributes, handleMissingClass);
0678:
0679: if (estTypes.incremental)
0680: incrementingEquality(workAttrTypes, classType);
0681: }
0682: }
0683: return attrTypes;
0684: }
0685:
0686: /**
0687: * Checks whether the scheme can take command line options.
0688: *
0689: * @return index 0 is true if the estimator can take options
0690: */
0691: protected boolean[] canTakeOptions() {
0692:
0693: boolean[] result = new boolean[2];
0694:
0695: print("options...");
0696: if (m_Estimator instanceof OptionHandler) {
0697: println("yes");
0698: if (m_Debug) {
0699: println("\n=== Full report ===");
0700: Enumeration enu = ((OptionHandler) m_Estimator)
0701: .listOptions();
0702: while (enu.hasMoreElements()) {
0703: Option option = (Option) enu.nextElement();
0704: print(option.synopsis() + "\n"
0705: + option.description() + "\n");
0706: }
0707: println("\n");
0708: }
0709: result[0] = true;
0710: } else {
0711: println("no");
0712: result[0] = false;
0713: }
0714:
0715: return result;
0716: }
0717:
0718: /**
0719: * Checks whether the scheme can build models incrementally.
0720: *
0721: * @return index 0 is true if the estimator can train incrementally
0722: */
0723: protected boolean[] incrementalEstimator() {
0724:
0725: boolean[] result = new boolean[2];
0726:
0727: print("incremental estimator...");
0728: if (m_Estimator instanceof IncrementalEstimator) {
0729: println("yes");
0730: result[0] = true;
0731: } else {
0732: println("no");
0733: result[0] = false;
0734: }
0735:
0736: return result;
0737: }
0738:
0739: /**
0740: * Checks whether the scheme says it can handle instance weights.
0741: *
0742: * @return true if the estimator handles instance weights
0743: */
0744: protected boolean[] weightedInstancesHandler() {
0745:
0746: boolean[] result = new boolean[2];
0747:
0748: print("weighted instances estimator...");
0749: if (m_Estimator instanceof WeightedInstancesHandler) {
0750: println("yes");
0751: result[0] = true;
0752: } else {
0753: println("no");
0754: result[0] = false;
0755: }
0756:
0757: return result;
0758: }
0759:
0760: /**
0761: * Checks whether the estimator is supervised.
0762: *
0763: * @return true if the estimator handles instance weights
0764: */
0765: protected boolean[] super visedEstimator() {
0766: boolean[] result = new boolean[2];
0767: result[0] = false;
0768: return result;
0769: }
0770:
0771: /**
0772: * Checks basic estimation of one attribute of the scheme, for simple non-troublesome
0773: * datasets.
0774: *
0775: * @param attrTypes the types the estimator can work with
0776: * @param classType the class type (NOMINAL, NUMERIC, etc.)
0777: * @return index 0 is true if the test was passed, index 1 is true if test
0778: * was acceptable
0779: */
0780: protected boolean[] canEstimate(AttrTypes attrTypes,
0781: boolean super vised, int classType) {
0782:
0783: // supervised is ignored, no supervised estimators used yet
0784:
0785: print("basic estimation");
0786: printAttributeSummary(attrTypes, classType);
0787: print("...");
0788: FastVector accepts = new FastVector();
0789: accepts.addElement("nominal");
0790: accepts.addElement("numeric");
0791: accepts.addElement("string");
0792: accepts.addElement("date");
0793: accepts.addElement("relational");
0794: accepts.addElement("not in classpath");
0795: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0796: boolean attributeMissing = false, classMissing = false;
0797: int numAtts = 1, attrIndex = 0;
0798:
0799: return runBasicTest(attrTypes, numAtts, attrIndex, classType,
0800: missingLevel, attributeMissing, classMissing, numTrain,
0801: numTest, numClasses, accepts);
0802: }
0803:
0804: /**
0805: * Checks basic estimation of one attribute of the scheme, for simple non-troublesome
0806: * datasets.
0807: *
0808: * @param attrTypes the types the estimator can work with
0809: * @param classType the class type (NOMINAL, NUMERIC, etc.)
0810: */
0811: protected void canSplitUpClass(AttrTypes attrTypes, int classType) {
0812:
0813: if (attrTypes.nominal)
0814: canSplitUpClass(Attribute.NOMINAL, classType);
0815: if (attrTypes.numeric)
0816: canSplitUpClass(Attribute.NUMERIC, classType);
0817: }
0818:
0819: /**
0820: * Checks basic estimation of one attribute of the scheme, for simple non-troublesome
0821: * datasets.
0822: *
0823: * @param attrType the type of the estimator
0824: * @param classType the class type (NOMINAL, NUMERIC, etc.)
0825: * @return index 0 is true if the test was passed, index 1 is true if test
0826: * was acceptable
0827: */
0828: protected boolean[] canSplitUpClass(int attrType, int classType) {
0829:
0830: boolean[] result = new boolean[2];
0831:
0832: FastVector accepts = new FastVector();
0833: accepts.addElement("not in classpath");
0834:
0835: // supervised is ignored, no supervised estimators used yet
0836: print("split per class type ");
0837: printAttributeSummary(attrType, Attribute.NOMINAL);
0838: print("...");
0839:
0840: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2;
0841: boolean attributeMissing = false, classMissing = false;
0842: int numAtts = 3, attrIndex = 0, classIndex = 1;
0843: Instances train = null;
0844: Vector test;
0845: Estimator estimator = null;
0846: boolean built = false;
0847:
0848: try {
0849: AttrTypes at = new AttrTypes(attrType);
0850: train = makeTestDataset(42, numTrain, numAtts, at,
0851: numClasses, classType, classIndex);
0852:
0853: // prepare training data set and test value list
0854: test = makeTestValueList(24, numTest, train, attrIndex,
0855: attrType);
0856:
0857: estimator = Estimator.makeCopies(getEstimator(), 1)[0];
0858: } catch (Exception ex) {
0859: ex.printStackTrace();
0860: throw new Error("Error setting up for tests: "
0861: + ex.getMessage());
0862: }
0863: try {
0864: estimator
0865: .addValues(train, attrIndex, classType, classIndex);
0866: built = true;
0867:
0868: testWithTestValues(estimator, test);
0869:
0870: println("yes");
0871: result[0] = true;
0872: } catch (Exception ex) {
0873: boolean acceptable = false;
0874: String msg;
0875: if (ex.getMessage() == null)
0876: msg = "";
0877: else
0878: msg = ex.getMessage().toLowerCase();
0879: if (msg.indexOf("not in classpath") > -1)
0880: m_ClasspathProblems = true;
0881:
0882: for (int i = 0; i < accepts.size(); i++) {
0883: if (msg.indexOf((String) accepts.elementAt(i)) >= 0) {
0884: acceptable = true;
0885: }
0886: }
0887:
0888: println("no" + (acceptable ? " (OK error message)" : ""));
0889: result[1] = acceptable;
0890:
0891: if (m_Debug) {
0892: println("\n=== Full Report ===");
0893: print("Problem during");
0894: if (built) {
0895: print(" testing");
0896: } else {
0897: print(" training");
0898: }
0899: println(": " + ex.getMessage() + "\n");
0900: if (!acceptable) {
0901: if (accepts.size() > 0) {
0902: print("Error message doesn't mention ");
0903: for (int i = 0; i < accepts.size(); i++) {
0904: if (i != 0) {
0905: print(" or ");
0906: }
0907: print('"' + (String) accepts.elementAt(i) + '"');
0908: }
0909: }
0910: println("here are the datasets:\n");
0911: println("=== Train Dataset ===\n"
0912: + train.toString() + "\n");
0913: println("=== Test Dataset ===\n" + test.toString()
0914: + "\n\n");
0915: }
0916:
0917: }
0918: }
0919: return result;
0920: }
0921:
0922: /**
0923: * Checks whether nominal schemes can handle more than two classes.
0924: * If a scheme is only designed for two-class problems it should
0925: * throw an appropriate exception for multi-class problems.
0926: *
0927: * @param attrTypes attribute types the estimator excepts
0928: * @param numClasses the number of classes to test
0929: * @return index 0 is true if the test was passed, index 1 is true if test
0930: * was acceptable
0931: */
0932: protected boolean[] canHandleNClasses(AttrTypes attrTypes,
0933: int numClasses) {
0934:
0935: print("more than two class problems");
0936: printAttributeSummary(attrTypes, Attribute.NOMINAL);
0937: print("...");
0938:
0939: FastVector accepts = new FastVector();
0940: accepts.addElement("number");
0941: accepts.addElement("class");
0942:
0943: int numTrain = getNumInstances(), numTest = getNumInstances(), missingLevel = 0;
0944: boolean attributeMissing = false, classMissing = false;
0945: int numAttr = 1, attrIndex = 0;
0946:
0947: return runBasicTest(attrTypes, numAttr, attrIndex,
0948: Attribute.NOMINAL, missingLevel, attributeMissing,
0949: classMissing, numTrain, numTest, numClasses, accepts);
0950: }
0951:
0952: /**
0953: * Checks whether the scheme can handle class attributes as Nth attribute.
0954: *
0955: * @param attrTypes the attribute types the estimator accepts
0956: * @param numAtts of attributes
0957: * @param attrIndex the index of the attribute
0958: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0959: * @param classIndex the index of the class attribute (0-based, -1 means last attribute)
0960: * @return index 0 is true if the test was passed, index 1 is true if test
0961: * was acceptable
0962: * @see TestInstances#CLASS_IS_LAST
0963: */
0964: protected boolean[] canHandleClassAsNthAttribute(
0965: AttrTypes attrTypes, int numAtts, int attrIndex,
0966: int classType, int classIndex) {
0967:
0968: if (classIndex == TestInstances.CLASS_IS_LAST)
0969: print("class attribute as last attribute");
0970: else
0971: print("class attribute as " + (classIndex + 1)
0972: + ". attribute");
0973: printAttributeSummary(attrTypes, classType);
0974: print("...");
0975: FastVector accepts = new FastVector();
0976: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0977: boolean attributeMissing = false, classMissing = false;
0978:
0979: return runBasicTest(attrTypes, numAtts, attrIndex, classType,
0980: classIndex, missingLevel, attributeMissing,
0981: classMissing, numTrain, numTest, numClasses, accepts);
0982: }
0983:
0984: /**
0985: * Checks whether the scheme can handle zero training instances.
0986: *
0987: * @param attrTypes attribute types that can be estimated
0988: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0989: * @return index 0 is true if the test was passed, index 1 is true if test
0990: * was acceptable
0991: */
0992: protected boolean[] canHandleZeroTraining(AttrTypes attrTypes,
0993: int classType) {
0994:
0995: print("handle zero training instances");
0996: printAttributeSummary(attrTypes, classType);
0997:
0998: print("...");
0999: FastVector accepts = new FastVector();
1000: accepts.addElement("train");
1001: accepts.addElement("value");
1002: int numTrain = 0, numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1003: boolean attributeMissing = false, classMissing = false;
1004: int numAtts = 1;
1005: int attrIndex = 0;
1006: return runBasicTest(attrTypes, numAtts, attrIndex, classType,
1007: missingLevel, attributeMissing, classMissing, numTrain,
1008: numTest, numClasses, accepts);
1009: }
1010:
1011: /**
1012: * Checks whether the scheme correctly initialises models when
1013: * buildEstimator is called. This test calls buildEstimator with
1014: * one training dataset and records performance on a test set.
1015: * buildEstimator is then called on a training set with different
1016: * structure, and then again with the original training set. The
1017: * performance on the test set is compared with the original results
1018: * and any performance difference noted as incorrect build initialisation.
1019: *
1020: * @param attrTypes attribute types that can be estimated
1021: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1022: * @return index 0 is true if the test was passed, index 1 is true if the
1023: * scheme performs worse than ZeroR, but without error (index 0 is
1024: * false)
1025: */
1026: protected boolean[] correctBuildInitialisation(AttrTypes attrTypes,
1027: int classType) {
1028:
1029: boolean[] result = new boolean[2];
1030:
1031: print("correct initialisation during buildEstimator");
1032: printAttributeSummary(attrTypes, classType);
1033:
1034: print("...");
1035: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1036: boolean attributeMissing = false, classMissing = false;
1037:
1038: Instances train1 = null;
1039: Instances test1 = null;
1040: Instances train2 = null;
1041: Instances test2 = null;
1042: Estimator estimator = null;
1043: Estimator estimator1 = null;
1044:
1045: boolean built = false;
1046: int stage = 0;
1047: int attrIndex1 = 1;
1048: int attrIndex2 = 2;
1049:
1050: try {
1051:
1052: // Make two sets of train/test splits with different
1053: // numbers of attributes
1054: train1 = makeTestDataset(42, numTrain, 2, attrTypes,
1055: numClasses, classType);
1056: train2 = makeTestDataset(84, numTrain, 3, attrTypes,
1057: numClasses, classType);
1058: if (missingLevel > 0) {
1059: addMissing(train1, missingLevel, attributeMissing,
1060: classMissing, attrIndex1);
1061: addMissing(train2, missingLevel, attributeMissing,
1062: classMissing, attrIndex2);
1063: }
1064:
1065: estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1066: } catch (Exception ex) {
1067: throw new Error("Error setting up for tests: "
1068: + ex.getMessage());
1069: }
1070: try {
1071: //TESTING??
1072: stage = 0;
1073: estimator.addValues(train1, attrIndex1);
1074: built = true;
1075:
1076: estimator1 = estimator.makeCopies(getEstimator(), 1)[0];
1077:
1078: stage = 1;
1079: built = false;
1080: estimator.addValues(train2, attrIndex2);
1081: built = true;
1082:
1083: stage = 2;
1084: built = false;
1085: estimator.addValues(train1, attrIndex1);
1086: built = true;
1087:
1088: stage = 3;
1089: if (!estimator.equals(estimator1)) {
1090: if (m_Debug) {
1091: println("\n=== Full report ===\n"
1092: + "\nFirst build estimator\n"
1093: + estimator.toString() + "\n\n");
1094: println("\nSecond build estimator\n"
1095: + estimator.toString() + "\n\n");
1096: }
1097: throw new Exception(
1098: "Results differ between buildEstimator calls");
1099: }
1100: println("yes");
1101: result[0] = true;
1102:
1103: if (false && m_Debug) {
1104: println("\n=== Full report ===\n"
1105: + "\nFirst buildEstimator()" + "\n\n");
1106: println("\nSecond buildEstimator()" + "\n\n");
1107: }
1108: } catch (Exception ex) {
1109: String msg = ex.getMessage().toLowerCase();
1110: if (msg.indexOf("worse than zeror") >= 0) {
1111: println("warning: performs worse than ZeroR");
1112: result[0] = true;
1113: result[1] = true;
1114: } else {
1115: println("no");
1116: result[0] = false;
1117: }
1118: if (m_Debug) {
1119: println("\n=== Full Report ===");
1120: print("Problem during");
1121: if (built) {
1122: print(" testing");
1123: } else {
1124: print(" training");
1125: }
1126: switch (stage) {
1127: case 0:
1128: print(" of dataset 1");
1129: break;
1130: case 1:
1131: print(" of dataset 2");
1132: break;
1133: case 2:
1134: print(" of dataset 1 (2nd build)");
1135: break;
1136: case 3:
1137: print(", comparing results from builds of dataset 1");
1138: break;
1139: }
1140: println(": " + ex.getMessage() + "\n");
1141: println("here are the datasets:\n");
1142: println("=== Train1 Dataset ===\n" + train1.toString()
1143: + "\n");
1144: println("=== Test1 Dataset ===\n" + test1.toString()
1145: + "\n\n");
1146: println("=== Train2 Dataset ===\n" + train2.toString()
1147: + "\n");
1148: println("=== Test2 Dataset ===\n" + test2.toString()
1149: + "\n\n");
1150: }
1151: }
1152:
1153: return result;
1154: }
1155:
1156: /**
1157: * Checks basic missing value handling of the scheme. If the missing
1158: * values cause an exception to be thrown by the scheme, this will be
1159: * recorded.
1160: *
1161: * @param attrTypes attribute types that can be estimated
1162: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1163: * @param attributeMissing true if the missing values may be in
1164: * the attributes
1165: * @param classMissing true if the missing values may be in the class
1166: * @param missingLevel the percentage of missing values
1167: * @return index 0 is true if the test was passed, index 1 is true if test
1168: * was acceptable
1169: */
1170: protected boolean[] canHandleMissing(AttrTypes attrTypes,
1171: int classType, boolean attributeMissing,
1172: boolean classMissing, int missingLevel) {
1173:
1174: if (missingLevel == 100)
1175: print("100% ");
1176: print("missing");
1177: if (attributeMissing) {
1178: print(" attribute");
1179: if (classMissing)
1180: print(" and");
1181: }
1182: if (classMissing)
1183: print(" class");
1184: print(" values");
1185: printAttributeSummary(attrTypes, classType);
1186:
1187: print("...");
1188: FastVector accepts = new FastVector();
1189: accepts.addElement("missing");
1190: accepts.addElement("value");
1191: accepts.addElement("train");
1192: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2;
1193:
1194: int numAtts = 1, attrIndex = 0;
1195: return runBasicTest(attrTypes, numAtts, attrIndex, classType,
1196: missingLevel, attributeMissing, classMissing, numTrain,
1197: numTest, numClasses, accepts);
1198: }
1199:
1200: /**
1201: * Checks whether an incremental scheme produces the same model when
1202: * trained incrementally as when batch trained. The model itself
1203: * cannot be compared, so we compare the evaluation on test data
1204: * for both models. It is possible to get a false positive on this
1205: * test (likelihood depends on the estimator).
1206: *
1207: * @param attrTypes attribute types that can be estimated
1208: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1209: * @return index 0 is true if the test was passed
1210: */
1211: protected boolean[] incrementingEquality(AttrTypes attrTypes,
1212: int classType) {
1213:
1214: print("incremental training produces the same results"
1215: + " as batch training");
1216: printAttributeSummary(attrTypes, classType);
1217:
1218: print("...");
1219: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1220: boolean attributeMissing = false, classMissing = false;
1221:
1222: boolean[] result = new boolean[2];
1223: Instances train = null;
1224: Estimator[] estimators = null;
1225: boolean built = false;
1226: int attrIndex = 0;
1227: Vector test;
1228: try {
1229: train = makeTestDataset(42, numTrain, 1, attrTypes,
1230: numClasses, classType);
1231:
1232: // prepare training data set and test value list
1233: test = makeTestValueList(24, numTest, train, attrIndex,
1234: attrTypes.getSetType());
1235:
1236: if (missingLevel > 0) {
1237: addMissing(train, missingLevel, attributeMissing,
1238: classMissing, attrIndex);
1239: }
1240: estimators = Estimator.makeCopies(getEstimator(), 2);
1241: estimators[0].addValues(train, attrIndex);
1242: } catch (Exception ex) {
1243: throw new Error("Error setting up for tests: "
1244: + ex.getMessage());
1245: }
1246: try {
1247: for (int i = 0; i < train.numInstances(); i++) {
1248: ((IncrementalEstimator) estimators[1]).addValue(train
1249: .instance(i).value(attrIndex), 1.0);
1250: }
1251: built = true;
1252: if (!estimators[0].equals(estimators[1])) {
1253: println("no");
1254: result[0] = false;
1255:
1256: if (m_Debug) {
1257: println("\n=== Full Report ===");
1258: println("Results differ between batch and "
1259: + "incrementally built models.\n"
1260: + "Depending on the estimator, this may be OK");
1261: println("Here are the results:\n");
1262: println("batch built results\n"
1263: + estimators[0].toString());
1264: println("incrementally built results\n"
1265: + estimators[1].toString());
1266: println("Here are the datasets:\n");
1267: println("=== Train Dataset ===\n"
1268: + train.toString() + "\n");
1269: println("=== Test Dataset ===\n" + test.toString()
1270: + "\n\n");
1271: }
1272: } else {
1273: println("yes");
1274: result[0] = true;
1275: }
1276: } catch (Exception ex) {
1277: result[0] = false;
1278:
1279: print("Problem during");
1280: if (built)
1281: print(" testing");
1282: else
1283: print(" training");
1284: println(": " + ex.getMessage() + "\n");
1285: }
1286:
1287: return result;
1288: }
1289:
1290: /**
1291: * Checks whether the estimator can handle instance weights.
1292: * This test compares the estimator performance on two datasets
1293: * that are identical except for the training weights. If the
1294: * results change, then the estimator must be using the weights. It
1295: * may be possible to get a false positive from this test if the
1296: * weight changes aren't significant enough to induce a change
1297: * in estimator performance (but the weights are chosen to minimize
1298: * the likelihood of this).
1299: *
1300: * @param attrTypes attribute types that can be estimated
1301: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1302: * @return index 0 true if the test was passed
1303: */
1304: protected boolean[] instanceWeights(AttrTypes attrTypes,
1305: int classType) {
1306:
1307: print("estimator uses instance weights");
1308: printAttributeSummary(attrTypes, classType);
1309:
1310: print("...");
1311:
1312: int numTrain = 2 * getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1313: boolean attributeMissing = false, classMissing = false;
1314:
1315: boolean[] result = new boolean[2];
1316: Instances train = null;
1317: Vector test = null;
1318: Estimator[] estimators = null;
1319:
1320: Vector resultProbsO = null;
1321: Vector resultProbsW = null;
1322: boolean built = false;
1323: boolean evalFail = false;
1324: int attrIndex = 0;
1325: try {
1326: train = makeTestDataset(42, numTrain, 1, attrTypes,
1327: numClasses, classType);
1328:
1329: // prepare training data set and test value list
1330: test = makeTestValueList(24, numTest, train, attrIndex,
1331: attrTypes.getSetType());
1332:
1333: if (missingLevel > 0) {
1334: addMissing(train, missingLevel, attributeMissing,
1335: classMissing, attrIndex);
1336: }
1337:
1338: estimators = Estimator.makeCopies(getEstimator(), 2);
1339:
1340: estimators[0].addValues(train, attrIndex);
1341: resultProbsO = testWithTestValues(estimators[0], test);
1342:
1343: } catch (Exception ex) {
1344: throw new Error("Error setting up for tests: "
1345: + ex.getMessage());
1346: }
1347: try {
1348:
1349: // Now modify instance weights and re-built
1350: for (int i = 0; i < train.numInstances(); i++) {
1351: train.instance(i).setWeight(0);
1352: }
1353: Random random = new Random(1);
1354: for (int i = 0; i < train.numInstances() / 2; i++) {
1355: int inst = Math.abs(random.nextInt())
1356: % train.numInstances();
1357: int weight = Math.abs(random.nextInt()) % 10 + 1;
1358: train.instance(inst).setWeight(weight);
1359: }
1360: estimators[1].addValues(train, attrIndex);
1361: resultProbsW = testWithTestValues(estimators[1], test);
1362:
1363: built = true;
1364: if (resultProbsO.equals(resultProbsW)) {
1365: // println("no");
1366: evalFail = true;
1367: throw new Exception("evalFail");
1368: }
1369:
1370: println("yes");
1371: result[0] = true;
1372: } catch (Exception ex) {
1373: println("no");
1374: result[0] = false;
1375:
1376: if (m_Debug) {
1377: println("\n=== Full Report ===");
1378:
1379: if (evalFail) {
1380: println("Results don't differ between non-weighted and "
1381: + "weighted instance models.");
1382: println("Here are the results:\n");
1383: println(probsToString(resultProbsO));
1384: } else {
1385: print("Problem during");
1386: if (built) {
1387: print(" testing");
1388: } else {
1389: print(" training");
1390: }
1391: println(": " + ex.getMessage() + "\n");
1392: }
1393: println("Here are the datasets:\n");
1394: println("=== Train Dataset ===\n" + train.toString()
1395: + "\n");
1396: println("=== Train Weights ===\n");
1397: for (int i = 0; i < train.numInstances(); i++) {
1398: println(" " + (i + 1) + " "
1399: + train.instance(i).weight());
1400: }
1401: println("=== Test Dataset ===\n" + test.toString()
1402: + "\n\n");
1403: println("(test weights all 1.0\n");
1404: }
1405: }
1406:
1407: return result;
1408: }
1409:
1410: /**
1411: * Checks whether the scheme alters the training dataset during
1412: * training. If the scheme needs to modify the training
1413: * data it should take a copy of the training data. Currently checks
1414: * for changes to header structure, number of instances, order of
1415: * instances, instance weights.
1416: *
1417: * @param attrTypes attribute types that can be estimated
1418: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1419: * @param attributeMissing true if we know the estimator can handle
1420: * (at least) moderate missing attribute values
1421: * @param classMissing true if we know the estimator can handle
1422: * (at least) moderate missing class values
1423: * @return index 0 is true if the test was passed
1424: */
1425: protected boolean[] datasetIntegrity(AttrTypes attrTypes,
1426: int classType, boolean attributeMissing,
1427: boolean classMissing) {
1428:
1429: Estimator estimator = null;
1430: print("estimator doesn't alter original datasets");
1431: printAttributeSummary(attrTypes, classType);
1432: print("...");
1433: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 100;
1434:
1435: boolean[] result = new boolean[2];
1436: Instances train = null;
1437: boolean built = false;
1438: try {
1439: train = makeTestDataset(42, numTrain, 1, attrTypes,
1440: numClasses, classType);
1441: int attrIndex = 0;
1442:
1443: if (missingLevel > 0) {
1444: addMissing(train, missingLevel, attributeMissing,
1445: classMissing, attrIndex);
1446: }
1447: estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1448: } catch (Exception ex) {
1449: throw new Error("Error setting up for tests: "
1450: + ex.getMessage());
1451: }
1452: try {
1453: Instances trainCopy = new Instances(train);
1454: int attrIndex = 0;
1455: estimator.addValues(trainCopy, attrIndex);
1456: compareDatasets(train, trainCopy);
1457: built = true;
1458:
1459: println("yes");
1460: result[0] = true;
1461: } catch (Exception ex) {
1462: println("no");
1463: result[0] = false;
1464:
1465: if (m_Debug) {
1466: println("\n=== Full Report ===");
1467: print("Problem during");
1468: if (built) {
1469: print(" testing");
1470: } else {
1471: print(" training");
1472: }
1473: println(": " + ex.getMessage() + "\n");
1474: println("Here are the datasets:\n");
1475: println("=== Train Dataset ===\n" + train.toString()
1476: + "\n");
1477: }
1478: }
1479:
1480: return result;
1481: }
1482:
1483: /**
1484: * Runs a text on the datasets with the given characteristics.
1485: *
1486: * @param attrTypes attribute types that can be estimated
1487: * @param numAtts number of attributes
1488: * @param attrIndex attribute index
1489: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1490: * @param missingLevel the percentage of missing values
1491: * @param attributeMissing true if the missing values may be in
1492: * the attributes
1493: * @param classMissing true if the missing values may be in the class
1494: * @param numTrain the number of instances in the training set
1495: * @param numTest the number of instaces in the test set
1496: * @param numClasses the number of classes
1497: * @param accepts the acceptable string in an exception
1498: * @return index 0 is true if the test was passed, index 1 is true if test
1499: * was acceptable
1500: */
1501: protected boolean[] runBasicTest(AttrTypes attrTypes, int numAtts,
1502: int attrIndex, int classType, int missingLevel,
1503: boolean attributeMissing, boolean classMissing,
1504: int numTrain, int numTest, int numClasses,
1505: FastVector accepts) {
1506:
1507: return runBasicTest(attrTypes, numAtts, attrIndex, classType,
1508: TestInstances.CLASS_IS_LAST, missingLevel,
1509: attributeMissing, classMissing, numTrain, numTest,
1510: numClasses, accepts);
1511: }
1512:
1513: /**
1514: * Runs a text on the datasets with the given characteristics.
1515: *
1516: * @param attrTypes attribute types that can be estimated
1517: * @param numAtts number of attributes
1518: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1519: * @param classIndex the attribute index of the class
1520: * @param missingLevel the percentage of missing values
1521: * @param attributeMissing true if the missing values may be in
1522: * the attributes
1523: * @param classMissing true if the missing values may be in the class
1524: * @param numTrain the number of instances in the training set
1525: * @param numTest the number of instaces in the test set
1526: * @param numClasses the number of classes
1527: * @param accepts the acceptable string in an exception
1528: * @return index 0 is true if the test was passed, index 1 is true if test
1529: * was acceptable
1530: */
1531: protected boolean[] runBasicTest(AttrTypes attrTypes, int numAtts,
1532: int attrIndex, int classType, int classIndex,
1533: int missingLevel, boolean attributeMissing,
1534: boolean classMissing, int numTrain, int numTest,
1535: int numClasses, FastVector accepts) {
1536:
1537: boolean[] result = new boolean[2];
1538: Instances train = null;
1539: Vector test = null;
1540: Estimator estimator = null;
1541: boolean built = false;
1542:
1543: try {
1544: train = makeTestDataset(42, numTrain, numAtts, attrTypes,
1545: numClasses, classType, classIndex);
1546:
1547: // prepare training data set and test value list
1548: if (numTrain > 0) {
1549: test = makeTestValueList(24, numTest, train, attrIndex,
1550: attrTypes.getSetType());
1551:
1552: } else {
1553: double min = -10.0;
1554: double max = 8.0;
1555: test = makeTestValueList(24, numTest, min, max,
1556: attrTypes.getSetType());
1557: }
1558:
1559: if (missingLevel > 0) {
1560: addMissing(train, missingLevel, attributeMissing,
1561: classMissing, attrIndex);
1562: }
1563: estimator = Estimator.makeCopies(getEstimator(), 1)[0];
1564: } catch (Exception ex) {
1565: ex.printStackTrace();
1566: throw new Error("Error setting up for tests: "
1567: + ex.getMessage());
1568: }
1569: try {
1570: estimator.addValues(train, attrIndex);
1571: built = true;
1572:
1573: testWithTestValues(estimator, test);
1574:
1575: println("yes");
1576: result[0] = true;
1577: } catch (Exception ex) {
1578: boolean acceptable = false;
1579: String msg;
1580: if (ex.getMessage() == null)
1581: msg = "";
1582: else
1583: msg = ex.getMessage().toLowerCase();
1584: if (msg.indexOf("not in classpath") > -1)
1585: m_ClasspathProblems = true;
1586:
1587: for (int i = 0; i < accepts.size(); i++) {
1588: if (msg.indexOf((String) accepts.elementAt(i)) >= 0) {
1589: acceptable = true;
1590: }
1591: }
1592:
1593: println("no" + (acceptable ? " (OK error message)" : ""));
1594: result[1] = acceptable;
1595:
1596: if (m_Debug) {
1597: println("\n=== Full Report ===");
1598: print("Problem during");
1599: if (built) {
1600: print(" testing");
1601: } else {
1602: print(" training");
1603: }
1604: println(": " + ex.getMessage() + "\n");
1605: if (!acceptable) {
1606: if (accepts.size() > 0) {
1607: print("Error message doesn't mention ");
1608: for (int i = 0; i < accepts.size(); i++) {
1609: if (i != 0) {
1610: print(" or ");
1611: }
1612: print('"' + (String) accepts.elementAt(i) + '"');
1613: }
1614: }
1615: println("here are the datasets:\n");
1616: println("=== Train Dataset ===\n"
1617: + train.toString() + "\n");
1618: println("=== Test Dataset ===\n" + test.toString()
1619: + "\n\n");
1620: }
1621:
1622: }
1623: }
1624: return result;
1625: }
1626:
1627: /**
1628: * Compare two datasets to see if they differ.
1629: *
1630: * @param data1 one set of instances
1631: * @param data2 the other set of instances
1632: * @throws Exception if the datasets differ
1633: */
1634: protected void compareDatasets(Instances data1, Instances data2)
1635: throws Exception {
1636: if (!data2.equalHeaders(data1)) {
1637: throw new Exception("header has been modified");
1638: }
1639: if (!(data2.numInstances() == data1.numInstances())) {
1640: throw new Exception("number of instances has changed");
1641: }
1642: for (int i = 0; i < data2.numInstances(); i++) {
1643: Instance orig = data1.instance(i);
1644: Instance copy = data2.instance(i);
1645: for (int j = 0; j < orig.numAttributes(); j++) {
1646: if (orig.isMissing(j)) {
1647: if (!copy.isMissing(j)) {
1648: throw new Exception("instances have changed");
1649: }
1650: } else if (orig.value(j) != copy.value(j)) {
1651: throw new Exception("instances have changed");
1652: }
1653: if (orig.weight() != copy.weight()) {
1654: throw new Exception("instance weights have changed");
1655: }
1656: }
1657: }
1658: }
1659:
1660: /**
1661: * Add missing values to a dataset.
1662: *
1663: * @param data the instances to add missing values to
1664: * @param level the level of missing values to add (if positive, this
1665: * is the probability that a value will be set to missing, if negative
1666: * all but one value will be set to missing (not yet implemented))
1667: * @param attributeMissing if true, attributes will be modified
1668: * @param classMissing if true, the class attribute will be modified
1669: * @param attrIndex index of the attribute
1670: */
1671: protected void addMissing(Instances data, int level,
1672: boolean attributeMissing, boolean classMissing,
1673: int attrIndex) {
1674:
1675: int classIndex = data.classIndex();
1676: Random random = new Random(1);
1677: for (int i = 0; i < data.numInstances(); i++) {
1678: Instance current = data.instance(i);
1679:
1680: for (int j = 0; j < data.numAttributes(); j++) {
1681: if (((j == classIndex) && classMissing)
1682: || ((j == attrIndex) && attributeMissing)) {
1683: if (Math.abs(random.nextInt()) % 100 < level)
1684: current.setMissing(j);
1685: }
1686: }
1687: }
1688: }
1689:
1690: /**
1691: * Make a simple set of instances, which can later be modified
1692: * for use in specific tests.
1693: *
1694: * @param seed the random number seed
1695: * @param numInstances the number of instances to generate
1696: * @param numAttr the number of attributes
1697: * @param attrTypes the attribute types
1698: * @param numClasses the number of classes (if nominal class)
1699: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1700: * @return the test dataset
1701: * @throws Exception if the dataset couldn't be generated
1702: * @see #process(Instances)
1703: */
1704: protected Instances makeTestDataset(int seed, int numInstances,
1705: int numAttr, AttrTypes attrTypes, int numClasses,
1706: int classType) throws Exception {
1707:
1708: return makeTestDataset(seed, numInstances, numAttr, attrTypes,
1709: numClasses, classType, TestInstances.CLASS_IS_LAST);
1710: }
1711:
1712: /**
1713: * Make a simple set of instances with variable position of the class
1714: * attribute, which can later be modified for use in specific tests.
1715: *
1716: * @param seed the random number seed
1717: * @param numInstances the number of instances to generate
1718: * @param numAttr the number of attributes to generate
1719: * @param attrTypes the type of attrbute that is excepted
1720: * @param numClasses the number of classes (if nominal class)
1721: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1722: * @param classIndex the index of the class (0-based, -1 as last)
1723: * @return the test dataset
1724: * @throws Exception if the dataset couldn't be generated
1725: * @see TestInstances#CLASS_IS_LAST
1726: * @see #process(Instances)
1727: */
1728: protected Instances makeTestDataset(int seed, int numInstances,
1729: int numAttr, AttrTypes attrTypes, int numClasses,
1730: int classType, int classIndex) throws Exception {
1731:
1732: TestInstances dataset = new TestInstances();
1733:
1734: dataset.setSeed(seed);
1735: dataset.setNumInstances(numInstances);
1736: dataset.setNumNominal(attrTypes.nominal ? numAttr : 0);
1737: dataset.setNumNumeric(attrTypes.numeric ? numAttr : 0);
1738: dataset.setNumString(attrTypes.string ? numAttr : 0);
1739: dataset.setNumDate(attrTypes.date ? numAttr : 0);
1740: dataset.setNumRelational(attrTypes.relational ? numAttr : 0);
1741: dataset.setNumClasses(numClasses);
1742: dataset.setClassType(classType);
1743: dataset.setClassIndex(classIndex);
1744:
1745: return process(dataset.generate());
1746: }
1747:
1748: /**
1749: * Make a simple set of values. Only one of the num'type' parameters should be larger 0.
1750: * (just to make parameter similar to the makeTestDataset parameters)
1751: *
1752: * @param seed the random number seed
1753: * @param numValues the number of values to generate
1754: * @param data the dataset to make test examples for
1755: * @param attrIndex index of the attribute
1756: * @param attrType the class type (NUMERIC, NOMINAL, etc.)
1757: * @throws Exception if the dataset couldn't be generated
1758: * @see #process(Instances)
1759: */
1760: protected Vector makeTestValueList(int seed, int numValues,
1761: Instances data, int attrIndex, int attrType)
1762: throws Exception {
1763:
1764: // get min max
1765: double[] minMax = getMinimumMaximum(data, attrIndex);
1766: double minValue = minMax[0];
1767: double maxValue = minMax[1];
1768:
1769: // make value list and put into a VECTOR
1770: double range = maxValue - minValue;
1771: Vector values = new Vector(numValues);
1772: Random random = new Random(seed);
1773:
1774: if (attrType == Attribute.NOMINAL) {
1775: for (int i = 0; i < numValues; i++) {
1776: Double v = new Double(
1777: (Math.abs(random.nextInt()) % (int) range)
1778: + (int) minValue);
1779: values.add(v);
1780: }
1781: }
1782: if (attrType == Attribute.NUMERIC) {
1783: for (int i = 0; i < numValues; i++) {
1784: Double v = new Double(random.nextDouble() * range
1785: + minValue);
1786: values.add(v);
1787: }
1788: }
1789: return values;
1790: }
1791:
1792: /**
1793: * Make a simple set of values. Only one of the num'type' parameters should be larger 0.
1794: * (just to make parameter similar to the makeTestDataset parameters)
1795: *
1796: * @param seed the random number seed
1797: * @param numValues the number of values to generate
1798: * @param minValue the minimal data value
1799: * @param maxValue the maximal data value
1800: * @param attrType the class type (NUMERIC, NOMINAL, etc.)
1801: * @throws Exception if the dataset couldn't be generated
1802: * @see #process(Instances)
1803: */
1804: protected Vector makeTestValueList(int seed, int numValues,
1805: double minValue, double maxValue, int attrType)
1806: throws Exception {
1807:
1808: // make value list and put into a VECTOR
1809: double range = maxValue - minValue;
1810: Vector values = new Vector(numValues);
1811: Random random = new Random(seed);
1812:
1813: if (attrType == Attribute.NOMINAL) {
1814: for (int i = 0; i < numValues; i++) {
1815: Double v = new Double(
1816: (Math.abs(random.nextInt()) % (int) range)
1817: + (int) minValue);
1818: values.add(v);
1819: }
1820: }
1821: if (attrType == Attribute.NUMERIC) {
1822: for (int i = 0; i < numValues; i++) {
1823: Double v = new Double(random.nextDouble() * range
1824: + minValue);
1825: values.add(v);
1826: }
1827: }
1828: return values;
1829: }
1830:
1831: /**
1832: * Test with test values.
1833: *
1834: * @param est estimator to be tested
1835: * @param test vector with test values
1836: *
1837: **/
1838: protected Vector testWithTestValues(Estimator est, Vector test) {
1839:
1840: Vector results = new Vector();
1841: for (int i = 0; i < test.size(); i++) {
1842: double testValue = ((Double) (test.elementAt(i)))
1843: .doubleValue();
1844: double prob = est.getProbability(testValue);
1845: Double p = new Double(prob);
1846: results.add(p);
1847: }
1848: return results;
1849: }
1850:
1851: /**
1852: * Gets the minimum and maximum of the values a the first attribute
1853: * of the given data set
1854: *
1855: * @param inst the instance
1856: * @param attrIndex the index of the attribut to find min and max
1857: * @return the array with the minimum value on index 0 and the max on index 1
1858: */
1859:
1860: protected double[] getMinimumMaximum(Instances inst, int attrIndex) {
1861: double[] minMax = new double[2];
1862:
1863: try {
1864: int num = getMinMax(inst, attrIndex, minMax);
1865: } catch (Exception ex) {
1866: ex.printStackTrace();
1867: System.out.println(ex.getMessage());
1868: }
1869: return minMax;
1870: // double minValue = minMax[0];
1871: // double maxValue = minMax[1];
1872: }
1873:
1874: /**
1875: * Find the minimum and the maximum of the attribute and return it in
1876: * the last parameter..
1877: * @param inst instances used to build the estimator
1878: * @param attrIndex index of the attribute
1879: * @param minMax the array to return minimum and maximum in
1880: * @return number of not missing values
1881: * @exception Exception if parameter minMax wasn't initialized properly
1882: */
1883: public static int getMinMax(Instances inst, int attrIndex,
1884: double[] minMax) throws Exception {
1885: double min = Double.NaN;
1886: double max = Double.NaN;
1887: Instance instance = null;
1888: int numNotMissing = 0;
1889: if ((minMax == null) || (minMax.length < 2)) {
1890: throw new Exception(
1891: "Error in Program, privat method getMinMax");
1892: }
1893:
1894: Enumeration enumInst = inst.enumerateInstances();
1895: if (enumInst.hasMoreElements()) {
1896: do {
1897: instance = (Instance) enumInst.nextElement();
1898: } while (instance.isMissing(attrIndex)
1899: && (enumInst.hasMoreElements()));
1900:
1901: // add values if not missing
1902: if (!instance.isMissing(attrIndex)) {
1903: numNotMissing++;
1904: min = instance.value(attrIndex);
1905: max = instance.value(attrIndex);
1906: }
1907: while (enumInst.hasMoreElements()) {
1908: instance = (Instance) enumInst.nextElement();
1909: if (!instance.isMissing(attrIndex)) {
1910: numNotMissing++;
1911: if (instance.value(attrIndex) < min) {
1912: min = (instance.value(attrIndex));
1913: } else {
1914: if (instance.value(attrIndex) > max) {
1915: max = (instance.value(attrIndex));
1916: }
1917: }
1918: }
1919: }
1920: }
1921: minMax[0] = min;
1922: minMax[1] = max;
1923: return numNotMissing;
1924: }
1925:
1926: /**
1927: * Print the probabilities after testing
1928: * @param probs vector with probability values
1929: * @return string with probability values printed
1930: */
1931: private String probsToString(Vector probs) {
1932: StringBuffer txt = new StringBuffer(" ");
1933: for (int i = 0; i < probs.size(); i++) {
1934: txt.append(""
1935: + ((Double) (probs.elementAt(i))).doubleValue()
1936: + " ");
1937: }
1938: return txt.toString();
1939: }
1940:
1941: /**
1942: * Provides a hook for derived classes to further modify the data.
1943: *
1944: * @param data the data to process
1945: * @return the processed data
1946: * @see #m_PostProcessor
1947: */
1948: protected Instances process(Instances data) {
1949: if (getPostProcessor() == null)
1950: return data;
1951: else
1952: return getPostProcessor().process(data);
1953: }
1954:
1955: /**
1956: * Print out a short summary string for the dataset characteristics
1957: *
1958: * @param attrTypes the attribute types used (NUMERIC, NOMINAL, etc.)
1959: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1960: */
1961: protected void printAttributeSummary(AttrTypes attrTypes,
1962: int classType) {
1963:
1964: String str = "";
1965:
1966: if (attrTypes.numeric)
1967: str += " numeric";
1968:
1969: if (attrTypes.nominal) {
1970: if (str.length() > 0)
1971: str += " &";
1972: str += " nominal";
1973: }
1974:
1975: if (attrTypes.string) {
1976: if (str.length() > 0)
1977: str += " &";
1978: str += " string";
1979: }
1980:
1981: if (attrTypes.date) {
1982: if (str.length() > 0)
1983: str += " &";
1984: str += " date";
1985: }
1986:
1987: if (attrTypes.relational) {
1988: if (str.length() > 0)
1989: str += " &";
1990: str += " relational";
1991: }
1992:
1993: str += " attributes)";
1994:
1995: switch (classType) {
1996: case Attribute.NUMERIC:
1997: str = " (numeric class," + str;
1998: break;
1999: case Attribute.NOMINAL:
2000: str = " (nominal class," + str;
2001: break;
2002: case Attribute.STRING:
2003: str = " (string class," + str;
2004: break;
2005: case Attribute.DATE:
2006: str = " (date class," + str;
2007: break;
2008: case Attribute.RELATIONAL:
2009: str = " (relational class," + str;
2010: break;
2011: }
2012:
2013: print(str);
2014: }
2015:
2016: /**
2017: * Print out a short summary string for the dataset characteristics
2018: *
2019: * @param attrType the attribute type (NUMERIC, NOMINAL, etc.)
2020: * @param classType the class type (NUMERIC, NOMINAL, etc.)
2021: */
2022: protected void printAttributeSummary(int attrType, int classType) {
2023:
2024: String str = "";
2025:
2026: switch (attrType) {
2027: case Attribute.NUMERIC:
2028: str = " numeric" + str;
2029: break;
2030: case Attribute.NOMINAL:
2031: str = " nominal" + str;
2032: break;
2033: case Attribute.STRING:
2034: str = " string" + str;
2035: break;
2036: case Attribute.DATE:
2037: str = " date" + str;
2038: break;
2039: case Attribute.RELATIONAL:
2040: str = " relational" + str;
2041: break;
2042: }
2043: str += " attribute(s))";
2044:
2045: switch (classType) {
2046: case Attribute.NUMERIC:
2047: str = " (numeric class," + str;
2048: break;
2049: case Attribute.NOMINAL:
2050: str = " (nominal class," + str;
2051: break;
2052: case Attribute.STRING:
2053: str = " (string class," + str;
2054: break;
2055: case Attribute.DATE:
2056: str = " (date class," + str;
2057: break;
2058: case Attribute.RELATIONAL:
2059: str = " (relational class," + str;
2060: break;
2061: }
2062:
2063: print(str);
2064: }
2065:
2066: /**
2067: * Test method for this class
2068: *
2069: * @param args the commandline parameters
2070: */
2071: public static void main(String[] args) {
2072: try {
2073: CheckEstimator check = new CheckEstimator();
2074:
2075: try {
2076: check.setOptions(args);
2077: Utils.checkForRemainingOptions(args);
2078: } catch (Exception ex) {
2079: String result = ex.getMessage()
2080: + "\n\n"
2081: + check.getClass().getName().replaceAll(
2082: ".*\\.", "") + " Options:\n\n";
2083: Enumeration enu = check.listOptions();
2084: while (enu.hasMoreElements()) {
2085: Option option = (Option) enu.nextElement();
2086: result += option.synopsis() + "\n"
2087: + option.description() + "\n";
2088: }
2089: throw new Exception(result);
2090: }
2091:
2092: check.doTests();
2093: } catch (Exception ex) {
2094: System.err.println(ex.getMessage());
2095: }
2096: }
2097: }
|