001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * CheckScheme.java
019: * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.core;
024:
025: import java.util.Enumeration;
026: import java.util.Random;
027: import java.util.StringTokenizer;
028: import java.util.Vector;
029:
030: /**
031: * Abstract general class for testing schemes in Weka. Derived classes are
032: * also used for JUnit tests.
033: *
034: * @author FracPete (fracpete at waikato dot ac dot nz)
035: * @version $Revision: 1.3 $
036: * @see TestInstances
037: */
038: public abstract class CheckScheme extends Check {
039:
040: /** a class for postprocessing the test-data */
041: public static class PostProcessor {
042: /**
043: * Provides a hook for derived classes to further modify the data. Currently,
044: * the data is just passed through.
045: *
046: * @param data the data to process
047: * @return the processed data
048: */
049: public Instances process(Instances data) {
050: return data;
051: }
052: }
053:
054: /** The number of instances in the datasets */
055: protected int m_NumInstances = 20;
056:
057: /** the number of nominal attributes */
058: protected int m_NumNominal = 2;
059:
060: /** the number of numeric attributes */
061: protected int m_NumNumeric = 1;
062:
063: /** the number of string attributes */
064: protected int m_NumString = 1;
065:
066: /** the number of date attributes */
067: protected int m_NumDate = 1;
068:
069: /** the number of relational attributes */
070: protected int m_NumRelational = 1;
071:
072: /** the number of instances in relational attributes (applies also for bags
073: * in multi-instance) */
074: protected int m_NumInstancesRelational = 10;
075:
076: /** for generating String attributes/classes */
077: protected String[] m_Words = TestInstances.DEFAULT_WORDS;
078:
079: /** for generating String attributes/classes */
080: protected String m_WordSeparators = TestInstances.DEFAULT_SEPARATORS;
081:
082: /** for post-processing the data even further */
083: protected PostProcessor m_PostProcessor = null;
084:
085: /** whether classpath problems occurred */
086: protected boolean m_ClasspathProblems = false;
087:
088: /**
089: * Returns an enumeration describing the available options.
090: *
091: * @return an enumeration of all the available options.
092: */
093: public Enumeration listOptions() {
094: Vector result = new Vector();
095:
096: Enumeration en = super .listOptions();
097: while (en.hasMoreElements())
098: result.addElement(en.nextElement());
099:
100: result
101: .addElement(new Option(
102: "\tThe number of instances in the datasets (default 20).",
103: "N", 1, "-N <num>"));
104:
105: result.addElement(new Option(
106: "\tThe number of nominal attributes (default 2).",
107: "nominal", 1, "-nominal <num>"));
108:
109: result
110: .addElement(new Option(
111: "\tThe number of values for nominal attributes (default 1).",
112: "nominal-values", 1, "-nominal-values <num>"));
113:
114: result.addElement(new Option(
115: "\tThe number of numeric attributes (default 1).",
116: "numeric", 1, "-numeric <num>"));
117:
118: result.addElement(new Option(
119: "\tThe number of string attributes (default 1).",
120: "string", 1, "-string <num>"));
121:
122: result.addElement(new Option(
123: "\tThe number of date attributes (default 1).", "date",
124: 1, "-date <num>"));
125:
126: result.addElement(new Option(
127: "\tThe number of relational attributes (default 1).",
128: "relational", 1, "-relational <num>"));
129:
130: result
131: .addElement(new Option(
132: "\tThe number of instances in relational/bag attributes (default 10).",
133: "num-instances-relational", 1,
134: "-num-instances-relational <num>"));
135:
136: result.addElement(new Option(
137: "\tThe words to use in string attributes.", "words", 1,
138: "-words <comma-separated-list>"));
139:
140: result.addElement(new Option(
141: "\tThe word separators to use in string attributes.",
142: "word-separators", 1, "-word-separators <chars>"));
143:
144: return result.elements();
145: }
146:
147: /**
148: * Parses a given list of options.
149: *
150: * @param options the list of options as an array of strings
151: * @throws Exception if an option is not supported
152: */
153: public void setOptions(String[] options) throws Exception {
154: String tmpStr;
155:
156: super .setOptions(options);
157:
158: tmpStr = Utils.getOption('N', options);
159: if (tmpStr.length() != 0)
160: setNumInstances(Integer.parseInt(tmpStr));
161: else
162: setNumInstances(20);
163:
164: tmpStr = Utils.getOption("nominal", options);
165: if (tmpStr.length() != 0)
166: setNumNominal(Integer.parseInt(tmpStr));
167: else
168: setNumNominal(2);
169:
170: tmpStr = Utils.getOption("numeric", options);
171: if (tmpStr.length() != 0)
172: setNumNumeric(Integer.parseInt(tmpStr));
173: else
174: setNumNumeric(1);
175:
176: tmpStr = Utils.getOption("string", options);
177: if (tmpStr.length() != 0)
178: setNumString(Integer.parseInt(tmpStr));
179: else
180: setNumString(1);
181:
182: tmpStr = Utils.getOption("date", options);
183: if (tmpStr.length() != 0)
184: setNumDate(Integer.parseInt(tmpStr));
185: else
186: setNumDate(1);
187:
188: tmpStr = Utils.getOption("relational", options);
189: if (tmpStr.length() != 0)
190: setNumRelational(Integer.parseInt(tmpStr));
191: else
192: setNumRelational(1);
193:
194: tmpStr = Utils.getOption("num-instances-relational", options);
195: if (tmpStr.length() != 0)
196: setNumInstancesRelational(Integer.parseInt(tmpStr));
197: else
198: setNumInstancesRelational(10);
199:
200: tmpStr = Utils.getOption("words", options);
201: if (tmpStr.length() != 0)
202: setWords(tmpStr);
203: else
204: setWords(new TestInstances().getWords());
205:
206: if (Utils.getOptionPos("word-separators", options) > -1) {
207: tmpStr = Utils.getOption("word-separators", options);
208: setWordSeparators(tmpStr);
209: } else {
210: setWordSeparators(TestInstances.DEFAULT_SEPARATORS);
211: }
212: }
213:
214: /**
215: * Gets the current settings of the CheckClassifier.
216: *
217: * @return an array of strings suitable for passing to setOptions
218: */
219: public String[] getOptions() {
220: Vector result;
221: String[] options;
222: int i;
223:
224: result = new Vector();
225:
226: options = super .getOptions();
227: for (i = 0; i < options.length; i++)
228: result.add(options[i]);
229:
230: result.add("-N");
231: result.add("" + getNumInstances());
232:
233: result.add("-nominal");
234: result.add("" + getNumNominal());
235:
236: result.add("-numeric");
237: result.add("" + getNumNumeric());
238:
239: result.add("-string");
240: result.add("" + getNumString());
241:
242: result.add("-date");
243: result.add("" + getNumDate());
244:
245: result.add("-relational");
246: result.add("" + getNumRelational());
247:
248: result.add("-words");
249: result.add("" + getWords());
250:
251: result.add("-word-separators");
252: result.add("" + getWordSeparators());
253:
254: return (String[]) result.toArray(new String[result.size()]);
255: }
256:
257: /**
258: * sets the PostProcessor to use
259: *
260: * @param value the new PostProcessor
261: * @see #m_PostProcessor
262: */
263: public void setPostProcessor(PostProcessor value) {
264: m_PostProcessor = value;
265: }
266:
267: /**
268: * returns the current PostProcessor, can be null
269: *
270: * @return the current PostProcessor
271: */
272: public PostProcessor getPostProcessor() {
273: return m_PostProcessor;
274: }
275:
276: /**
277: * returns TRUE if the classifier returned a "not in classpath" Exception
278: *
279: * @return true if CLASSPATH problems occurred
280: */
281: public boolean hasClasspathProblems() {
282: return m_ClasspathProblems;
283: }
284:
285: /**
286: * Begin the tests, reporting results to System.out
287: */
288: public abstract void doTests();
289:
290: /**
291: * Sets the number of instances to use in the datasets (some classifiers
292: * might require more instances).
293: *
294: * @param value the number of instances to use
295: */
296: public void setNumInstances(int value) {
297: m_NumInstances = value;
298: }
299:
300: /**
301: * Gets the current number of instances to use for the datasets.
302: *
303: * @return the number of instances
304: */
305: public int getNumInstances() {
306: return m_NumInstances;
307: }
308:
309: /**
310: * sets the number of nominal attributes
311: *
312: * @param value the number of nominal attributes
313: */
314: public void setNumNominal(int value) {
315: m_NumNominal = value;
316: }
317:
318: /**
319: * returns the current number of nominal attributes
320: *
321: * @return the number of nominal attributes
322: */
323: public int getNumNominal() {
324: return m_NumNominal;
325: }
326:
327: /**
328: * sets the number of numeric attributes
329: *
330: * @param value the number of numeric attributes
331: */
332: public void setNumNumeric(int value) {
333: m_NumNumeric = value;
334: }
335:
336: /**
337: * returns the current number of numeric attributes
338: *
339: * @return the number of numeric attributes
340: */
341: public int getNumNumeric() {
342: return m_NumNumeric;
343: }
344:
345: /**
346: * sets the number of string attributes
347: *
348: * @param value the number of string attributes
349: */
350: public void setNumString(int value) {
351: m_NumString = value;
352: }
353:
354: /**
355: * returns the current number of string attributes
356: *
357: * @return the number of string attributes
358: */
359: public int getNumString() {
360: return m_NumString;
361: }
362:
363: /**
364: * sets the number of data attributes
365: *
366: * @param value the number of date attributes
367: */
368: public void setNumDate(int value) {
369: m_NumDate = value;
370: }
371:
372: /**
373: * returns the current number of date attributes
374: *
375: * @return the number of date attributes
376: */
377: public int getNumDate() {
378: return m_NumDate;
379: }
380:
381: /**
382: * sets the number of relational attributes
383: *
384: * @param value the number of relational attributes
385: */
386: public void setNumRelational(int value) {
387: m_NumRelational = value;
388: }
389:
390: /**
391: * returns the current number of relational attributes
392: *
393: * @return the number of relational attributes
394: */
395: public int getNumRelational() {
396: return m_NumRelational;
397: }
398:
399: /**
400: * sets the number of instances in relational/bag attributes to produce
401: *
402: * @param value the number of instances
403: */
404: public void setNumInstancesRelational(int value) {
405: m_NumInstancesRelational = value;
406: }
407:
408: /**
409: * returns the current number of instances in relational/bag attributes to produce
410: *
411: * @return the number of instances
412: */
413: public int getNumInstancesRelational() {
414: return m_NumInstancesRelational;
415: }
416:
417: /**
418: * turns the comma-separated list into an array
419: *
420: * @param value the list to process
421: * @return the list as array
422: */
423: protected static String[] listToArray(String value) {
424: StringTokenizer tok;
425: Vector list;
426:
427: list = new Vector();
428: tok = new StringTokenizer(value, ",");
429: while (tok.hasMoreTokens())
430: list.add(tok.nextToken());
431:
432: return (String[]) list.toArray(new String[list.size()]);
433: }
434:
435: /**
436: * turns the array into a comma-separated list
437: *
438: * @param value the array to process
439: * @return the array as list
440: */
441: protected static String arrayToList(String[] value) {
442: String result;
443: int i;
444:
445: result = "";
446:
447: for (i = 0; i < value.length; i++) {
448: if (i > 0)
449: result += ",";
450: result += value[i];
451: }
452:
453: return result;
454: }
455:
456: /**
457: * returns a string representation of the attribute type
458: *
459: * @param type the attribute type to get a string rerpresentation for
460: * @return the string representation
461: */
462: public static String attributeTypeToString(int type) {
463: String result;
464:
465: switch (type) {
466: case Attribute.NUMERIC:
467: result = "numeric";
468: break;
469:
470: case Attribute.NOMINAL:
471: result = "nominal";
472: break;
473:
474: case Attribute.STRING:
475: result = "string";
476: break;
477:
478: case Attribute.DATE:
479: result = "date";
480: break;
481:
482: case Attribute.RELATIONAL:
483: result = "relational";
484: break;
485:
486: default:
487: result = "???";
488: }
489:
490: return result;
491: }
492:
493: /**
494: * Sets the comma-separated list of words to use for generating strings. The
495: * list must contain at least 2 words, otherwise an exception will be thrown.
496: *
497: * @param value the list of words
498: * @throws IllegalArgumentException if not at least 2 words are provided
499: */
500: public void setWords(String value) {
501: if (listToArray(value).length < 2)
502: throw new IllegalArgumentException(
503: "At least 2 words must be provided!");
504:
505: m_Words = listToArray(value);
506: }
507:
508: /**
509: * returns the words used for assembling strings in a comma-separated list.
510: *
511: * @return the words as comma-separated list
512: */
513: public String getWords() {
514: return arrayToList(m_Words);
515: }
516:
517: /**
518: * sets the word separators (chars) to use for assembling strings.
519: *
520: * @param value the characters to use as separators
521: */
522: public void setWordSeparators(String value) {
523: m_WordSeparators = value;
524: }
525:
526: /**
527: * returns the word separators (chars) to use for assembling strings.
528: *
529: * @return the current separators
530: */
531: public String getWordSeparators() {
532: return m_WordSeparators;
533: }
534:
535: /**
536: * Compare two datasets to see if they differ.
537: *
538: * @param data1 one set of instances
539: * @param data2 the other set of instances
540: * @throws Exception if the datasets differ
541: */
542: protected void compareDatasets(Instances data1, Instances data2)
543: throws Exception {
544:
545: if (!data2.equalHeaders(data1)) {
546: throw new Exception("header has been modified");
547: }
548: if (!(data2.numInstances() == data1.numInstances())) {
549: throw new Exception("number of instances has changed");
550: }
551: for (int i = 0; i < data2.numInstances(); i++) {
552: Instance orig = data1.instance(i);
553: Instance copy = data2.instance(i);
554: for (int j = 0; j < orig.numAttributes(); j++) {
555: if (orig.isMissing(j)) {
556: if (!copy.isMissing(j)) {
557: throw new Exception("instances have changed");
558: }
559: } else if (orig.value(j) != copy.value(j)) {
560: throw new Exception("instances have changed");
561: }
562: if (orig.weight() != copy.weight()) {
563: throw new Exception("instance weights have changed");
564: }
565: }
566: }
567: }
568:
569: /**
570: * Add missing values to a dataset.
571: *
572: * @param data the instances to add missing values to
573: * @param level the level of missing values to add (if positive, this
574: * is the probability that a value will be set to missing, if negative
575: * all but one value will be set to missing (not yet implemented))
576: * @param predictorMissing if true, predictor attributes will be modified
577: * @param classMissing if true, the class attribute will be modified
578: */
579: protected void addMissing(Instances data, int level,
580: boolean predictorMissing, boolean classMissing) {
581:
582: int classIndex = data.classIndex();
583: Random random = new Random(1);
584: for (int i = 0; i < data.numInstances(); i++) {
585: Instance current = data.instance(i);
586: for (int j = 0; j < data.numAttributes(); j++) {
587: if (((j == classIndex) && classMissing)
588: || ((j != classIndex) && predictorMissing)) {
589: if (Math.abs(random.nextInt()) % 100 < level)
590: current.setMissing(j);
591: }
592: }
593: }
594: }
595:
596: /**
597: * Provides a hook for derived classes to further modify the data.
598: *
599: * @param data the data to process
600: * @return the processed data
601: * @see #m_PostProcessor
602: */
603: protected Instances process(Instances data) {
604: if (getPostProcessor() == null)
605: return data;
606: else
607: return getPostProcessor().process(data);
608: }
609: }
|