0001: /*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * Instances.java
0019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.core;
0024:
0025: import weka.core.converters.ArffLoader.ArffReader;
0026: import weka.core.converters.ConverterUtils.DataSource;
0027:
0028: import java.io.FileReader;
0029: import java.io.IOException;
0030: import java.io.Reader;
0031: import java.io.Serializable;
0032: import java.util.Enumeration;
0033: import java.util.Random;
0034:
0035: /**
0036: * Class for handling an ordered set of weighted instances. <p>
0037: *
0038: * Typical usage: <p>
0039: * <pre>
0040: * import weka.core.converters.ConverterUtils.DataSource;
0041: * ...
0042: *
0043: * // Read all the instances in the file (ARFF, CSV, XRFF, ...)
0044: * DataSource source = new DataSource(filename);
0045: * Instances instances = source.getDataSet();
0046: *
0047: * // Make the last attribute be the class
0048: * instances.setClassIndex(instances.numAttributes() - 1);
0049: *
0050: * // Print header and instances.
0051: * System.out.println("\nDataset:\n");
0052: * System.out.println(instances);
0053: *
0054: * ...
0055: * </pre><p>
0056: *
0057: * All methods that change a set of instances are safe, i.e., a change
0058: * of a set of instances does not affect any other sets of
0059: * instances. All methods that change a dataset's attribute
0060: * information clone the dataset before it is changed.
0061: *
0062: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
0063: * @author Len Trigg (trigg@cs.waikato.ac.nz)
0064: * @author FracPete (fracpete at waikato dot ac dot nz)
0065: * @version $Revision: 1.72 $
0066: */
0067: public class Instances implements Serializable {
0068:
0069: /** for serialization */
0070: static final long serialVersionUID = -19412345060742748L;
0071:
0072: /** The filename extension that should be used for arff files */
0073: public final static String FILE_EXTENSION = ".arff";
0074:
0075: /** The filename extension that should be used for bin. serialized instances files */
0076: public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi";
0077:
0078: /** The keyword used to denote the start of an arff header */
0079: public final static String ARFF_RELATION = "@relation";
0080:
0081: /** The keyword used to denote the start of the arff data section */
0082: public final static String ARFF_DATA = "@data";
0083:
0084: /** The dataset's name. */
0085: protected/*@spec_public non_null@*/String m_RelationName;
0086:
0087: /** The attribute information. */
0088: protected/*@spec_public non_null@*/FastVector m_Attributes;
0089: /* public invariant (\forall int i; 0 <= i && i < m_Attributes.size();
0090: m_Attributes.elementAt(i) != null);
0091: */
0092:
0093: /** The instances. */
0094: protected/*@spec_public non_null@*/FastVector m_Instances;
0095:
0096: /** The class attribute's index */
0097: protected int m_ClassIndex;
0098: //@ protected invariant classIndex() == m_ClassIndex;
0099:
0100: /** The lines read so far in case of incremental loading. Since the
0101: * StreamTokenizer will be re-initialized with every instance that is read,
0102: * we have to keep track of the number of lines read so far.
0103: * @see #readInstance(Reader) */
0104: protected int m_Lines = 0;
0105:
0106: /**
0107: * Reads an ARFF file from a reader, and assigns a weight of
0108: * one to each instance. Lets the index of the class
0109: * attribute be undefined (negative).
0110: *
0111: * @param reader the reader
0112: * @throws IOException if the ARFF file is not read
0113: * successfully
0114: */
0115: public Instances(/*@non_null@*/Reader reader) throws IOException {
0116: ArffReader arff = new ArffReader(reader);
0117: Instances dataset = arff.getData();
0118: initialize(dataset, dataset.numInstances());
0119: dataset.copyInstances(0, this , dataset.numInstances());
0120: compactify();
0121: }
0122:
/**
 * Reads only the header of an ARFF file from a reader and reserves
 * space for the given number of instances. The header (including the
 * class index) is taken from the structure returned by the
 * {@link ArffReader}. A negative capacity is treated as 0 by
 * {@link #initialize(Instances, int)}. The reader's current line number
 * is remembered so that later {@link #readInstance(Reader)} calls can
 * continue counting from there.
 *
 * @param reader the reader
 * @param capacity the number of instances to reserve space for
 * @throws IllegalArgumentException per the original documentation, if the
 *         header is not read successfully; NOTE(review): this constructor
 *         itself performs no such check - confirm against ArffReader
 * @throws IOException if there is a problem with the reader.
 * @deprecated instead of using this method in conjunction with the
 * <code>readInstance(Reader)</code> method, one should use the
 * <code>ArffLoader</code> or <code>DataSource</code> class instead.
 * @see weka.core.converters.ArffLoader
 * @see weka.core.converters.ConverterUtils.DataSource
 */
//@ requires capacity >= 0;
//@ ensures classIndex() == -1;
@Deprecated
public Instances(/*@non_null@*/ Reader reader, int capacity)
        throws IOException {
    ArffReader headerReader = new ArffReader(reader, 0);
    initialize(headerReader.getStructure(), capacity);
    m_Lines = headerReader.getLineNo();
}
0150:
/**
 * Copy constructor. The new dataset shares the header information
 * (by reference) with the given dataset and receives all of its
 * instances via {@code copyInstances}.
 *
 * @param dataset the set to be copied
 */
public Instances(/*@non_null@*/ Instances dataset) {
    // Reserve room for every instance up front, then copy them over.
    this(dataset, dataset.numInstances());
    dataset.copyInstances(0, this, dataset.numInstances());
}
0163:
/**
 * Creates an empty dataset that reuses (by reference) the header
 * information of the given dataset. A negative capacity is treated
 * as 0.
 *
 * @param dataset the instances from which the header information is
 *        to be taken
 * @param capacity the number of instances to reserve space for
 */
public Instances(/*@non_null@*/ Instances dataset, int capacity) {
    // All the work, including clamping the capacity, happens in initialize().
    initialize(dataset, capacity);
}
0176:
0177: /**
0178: * initializes with the header information of the given dataset and sets
0179: * the capacity of the set of instances.
0180: *
0181: * @param dataset the dataset to use as template
0182: * @param capacity the number of rows to reserve
0183: */
0184: protected void initialize(Instances dataset, int capacity) {
0185: if (capacity < 0)
0186: capacity = 0;
0187:
0188: // Strings only have to be "shallow" copied because
0189: // they can't be modified.
0190: m_ClassIndex = dataset.m_ClassIndex;
0191: m_RelationName = dataset.m_RelationName;
0192: m_Attributes = dataset.m_Attributes;
0193: m_Instances = new FastVector(capacity);
0194: }
0195:
/**
 * Creates a new set of instances by copying a contiguous subset of
 * another set. The header information is shared with the source.
 *
 * @param source the set of instances from which a subset is to be
 *        created
 * @param first the index of the first instance to be copied
 * @param toCopy the number of instances to be copied
 * @throws IllegalArgumentException if {@code first} or {@code toCopy} is
 *         negative, or if the range {@code first .. first + toCopy - 1}
 *         does not lie within the source
 */
//@ requires 0 <= first;
//@ requires 0 <= toCopy;
//@ requires first + toCopy <= source.numInstances();
public Instances(/*@non_null@*/ Instances source, int first,
        int toCopy) {

    // A constructor call has to come first; a negative toCopy is clamped
    // to capacity 0 there and rejected by the check below.
    this(source, toCopy);

    // Compare toCopy against the remaining number of instances rather
    // than computing first + toCopy, which can overflow int and slip
    // past the range check. Negative counts are rejected explicitly.
    if ((first < 0) || (toCopy < 0)
            || (toCopy > source.numInstances() - first)) {
        throw new IllegalArgumentException(
                "Parameters first and/or toCopy out of range");
    }
    source.copyInstances(first, this, toCopy);
}
0220:
/**
 * Creates an empty set of instances that uses the given attribute
 * information. Every attribute in {@code attInfo} has its index set to
 * its position in the vector, and the class index is left undefined
 * (-1). A negative capacity is treated as 0, matching
 * {@link #initialize(Instances, int)}. The given attribute information
 * must not be changed after this constructor has been used, because the
 * vector is stored by reference.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set; negative values become 0
 */
public Instances(/*@non_null@*/ String name,
        /*@non_null@*/ FastVector attInfo, int capacity) {

    m_RelationName = name;
    m_ClassIndex = -1;
    m_Attributes = attInfo;
    for (int i = 0; i < numAttributes(); i++) {
        attribute(i).setIndex(i);
    }
    // Clamp so that a negative capacity yields an empty dataset, as
    // documented, instead of being handed to FastVector unchanged.
    m_Instances = new FastVector(Math.max(capacity, 0));
}
0242:
0243: /**
0244: * Create a copy of the structure, but "cleanse" string types (i.e.
0245: * doesn't contain references to the strings seen in the past).
0246: * Also cleanses all relational attributes.
0247: *
0248: * @return a copy of the instance structure.
0249: */
0250: public Instances stringFreeStructure() {
0251:
0252: FastVector atts = (FastVector) m_Attributes.copy();
0253: for (int i = 0; i < atts.size(); i++) {
0254: Attribute att = (Attribute) atts.elementAt(i);
0255: if (att.type() == Attribute.STRING) {
0256: atts.setElementAt(new Attribute(att.name(),
0257: (FastVector) null), i);
0258: } else if (att.type() == Attribute.RELATIONAL) {
0259: atts.setElementAt(new Attribute(att.name(),
0260: new Instances(att.relation(), 0)), i);
0261: }
0262: }
0263: Instances result = new Instances(relationName(), atts, 0);
0264: result.m_ClassIndex = m_ClassIndex;
0265: return result;
0266: }
0267:
0268: /**
0269: * Adds one instance to the end of the set.
0270: * Shallow copies instance before it is added. Increases the
0271: * size of the dataset if it is not large enough. Does not
0272: * check if the instance is compatible with the dataset.
0273: * Note: String or relational values are not transferred.
0274: *
0275: * @param instance the instance to be added
0276: */
0277: public void add(/*@non_null@*/Instance instance) {
0278:
0279: Instance newInstance = (Instance) instance.copy();
0280:
0281: newInstance.setDataset(this );
0282: m_Instances.addElement(newInstance);
0283: }
0284:
0285: /**
0286: * Returns an attribute.
0287: *
0288: * @param index the attribute's index (index starts with 0)
0289: * @return the attribute at the given position
0290: */
0291: //@ requires 0 <= index;
0292: //@ requires index < m_Attributes.size();
0293: //@ ensures \result != null;
0294: public/*@pure@*/Attribute attribute(int index) {
0295:
0296: return (Attribute) m_Attributes.elementAt(index);
0297: }
0298:
0299: /**
0300: * Returns an attribute given its name. If there is more than
0301: * one attribute with the same name, it returns the first one.
0302: * Returns null if the attribute can't be found.
0303: *
0304: * @param name the attribute's name
0305: * @return the attribute with the given name, null if the
0306: * attribute can't be found
0307: */
0308: public/*@pure@*/Attribute attribute(String name) {
0309:
0310: for (int i = 0; i < numAttributes(); i++) {
0311: if (attribute(i).name().equals(name)) {
0312: return attribute(i);
0313: }
0314: }
0315: return null;
0316: }
0317:
0318: /**
0319: * Checks for attributes of the given type in the dataset
0320: *
0321: * @param attType the attribute type to look for
0322: * @return true if attributes of the given type are present
0323: */
0324: public boolean checkForAttributeType(int attType) {
0325:
0326: int i = 0;
0327:
0328: while (i < m_Attributes.size()) {
0329: if (attribute(i++).type() == attType) {
0330: return true;
0331: }
0332: }
0333: return false;
0334: }
0335:
0336: /**
0337: * Checks for string attributes in the dataset
0338: *
0339: * @return true if string attributes are present, false otherwise
0340: */
0341: public/*@pure@*/boolean checkForStringAttributes() {
0342: return checkForAttributeType(Attribute.STRING);
0343: }
0344:
0345: /**
0346: * Checks if the given instance is compatible
0347: * with this dataset. Only looks at the size of
0348: * the instance and the ranges of the values for
0349: * nominal and string attributes.
0350: *
0351: * @param instance the instance to check
0352: * @return true if the instance is compatible with the dataset
0353: */
0354: public/*@pure@*/boolean checkInstance(Instance instance) {
0355:
0356: if (instance.numAttributes() != numAttributes()) {
0357: return false;
0358: }
0359: for (int i = 0; i < numAttributes(); i++) {
0360: if (instance.isMissing(i)) {
0361: continue;
0362: } else if (attribute(i).isNominal()
0363: || attribute(i).isString()) {
0364: if (!(Utils.eq(instance.value(i),
0365: (double) (int) instance.value(i)))) {
0366: return false;
0367: } else if (Utils.sm(instance.value(i), 0)
0368: || Utils.gr(instance.value(i), attribute(i)
0369: .numValues())) {
0370: return false;
0371: }
0372: }
0373: }
0374: return true;
0375: }
0376:
0377: /**
0378: * Returns the class attribute.
0379: *
0380: * @return the class attribute
0381: * @throws UnassignedClassException if the class is not set
0382: */
0383: //@ requires classIndex() >= 0;
0384: public/*@pure@*/Attribute classAttribute() {
0385:
0386: if (m_ClassIndex < 0) {
0387: throw new UnassignedClassException(
0388: "Class index is negative (not set)!");
0389: }
0390: return attribute(m_ClassIndex);
0391: }
0392:
0393: /**
0394: * Returns the class attribute's index. Returns negative number
0395: * if it's undefined.
0396: *
0397: * @return the class index as an integer
0398: */
0399: // ensures \result == m_ClassIndex;
0400: public/*@pure@*/int classIndex() {
0401:
0402: return m_ClassIndex;
0403: }
0404:
0405: /**
0406: * Compactifies the set of instances. Decreases the capacity of
0407: * the set so that it matches the number of instances in the set.
0408: */
0409: public void compactify() {
0410:
0411: m_Instances.trimToSize();
0412: }
0413:
0414: /**
0415: * Removes all instances from the set.
0416: */
0417: public void delete() {
0418:
0419: m_Instances = new FastVector();
0420: }
0421:
0422: /**
0423: * Removes an instance at the given position from the set.
0424: *
0425: * @param index the instance's position (index starts with 0)
0426: */
0427: //@ requires 0 <= index && index < numInstances();
0428: public void delete(int index) {
0429:
0430: m_Instances.removeElementAt(index);
0431: }
0432:
0433: /**
0434: * Deletes an attribute at the given position
0435: * (0 to numAttributes() - 1). A deep copy of the attribute
0436: * information is performed before the attribute is deleted.
0437: *
0438: * @param position the attribute's position (position starts with 0)
0439: * @throws IllegalArgumentException if the given index is out of range
0440: * or the class attribute is being deleted
0441: */
0442: //@ requires 0 <= position && position < numAttributes();
0443: //@ requires position != classIndex();
0444: public void deleteAttributeAt(int position) {
0445:
0446: if ((position < 0) || (position >= m_Attributes.size())) {
0447: throw new IllegalArgumentException("Index out of range");
0448: }
0449: if (position == m_ClassIndex) {
0450: throw new IllegalArgumentException(
0451: "Can't delete class attribute");
0452: }
0453: freshAttributeInfo();
0454: if (m_ClassIndex > position) {
0455: m_ClassIndex--;
0456: }
0457: m_Attributes.removeElementAt(position);
0458: for (int i = position; i < m_Attributes.size(); i++) {
0459: Attribute current = (Attribute) m_Attributes.elementAt(i);
0460: current.setIndex(current.index() - 1);
0461: }
0462: for (int i = 0; i < numInstances(); i++) {
0463: instance(i).forceDeleteAttributeAt(position);
0464: }
0465: }
0466:
0467: /**
0468: * Deletes all attributes of the given type in the dataset. A deep copy of
0469: * the attribute information is performed before an attribute is deleted.
0470: *
0471: * @param attType the attribute type to delete
0472: * @throws IllegalArgumentException if attribute couldn't be
0473: * successfully deleted (probably because it is the class attribute).
0474: */
0475: public void deleteAttributeType(int attType) {
0476: int i = 0;
0477: while (i < m_Attributes.size()) {
0478: if (attribute(i).type() == attType) {
0479: deleteAttributeAt(i);
0480: } else {
0481: i++;
0482: }
0483: }
0484: }
0485:
0486: /**
0487: * Deletes all string attributes in the dataset. A deep copy of the attribute
0488: * information is performed before an attribute is deleted.
0489: *
0490: * @throws IllegalArgumentException if string attribute couldn't be
0491: * successfully deleted (probably because it is the class attribute).
0492: * @see #deleteAttributeType(int)
0493: */
0494: public void deleteStringAttributes() {
0495: deleteAttributeType(Attribute.STRING);
0496: }
0497:
0498: /**
0499: * Removes all instances with missing values for a particular
0500: * attribute from the dataset.
0501: *
0502: * @param attIndex the attribute's index (index starts with 0)
0503: */
0504: //@ requires 0 <= attIndex && attIndex < numAttributes();
0505: public void deleteWithMissing(int attIndex) {
0506:
0507: FastVector newInstances = new FastVector(numInstances());
0508:
0509: for (int i = 0; i < numInstances(); i++) {
0510: if (!instance(i).isMissing(attIndex)) {
0511: newInstances.addElement(instance(i));
0512: }
0513: }
0514: m_Instances = newInstances;
0515: }
0516:
0517: /**
0518: * Removes all instances with missing values for a particular
0519: * attribute from the dataset.
0520: *
0521: * @param att the attribute
0522: */
0523: public void deleteWithMissing(/*@non_null@*/Attribute att) {
0524:
0525: deleteWithMissing(att.index());
0526: }
0527:
0528: /**
0529: * Removes all instances with a missing class value
0530: * from the dataset.
0531: *
0532: * @throws UnassignedClassException if class is not set
0533: */
0534: public void deleteWithMissingClass() {
0535:
0536: if (m_ClassIndex < 0) {
0537: throw new UnassignedClassException(
0538: "Class index is negative (not set)!");
0539: }
0540: deleteWithMissing(m_ClassIndex);
0541: }
0542:
0543: /**
0544: * Returns an enumeration of all the attributes.
0545: *
0546: * @return enumeration of all the attributes.
0547: */
0548: public/*@non_null pure@*/Enumeration enumerateAttributes() {
0549:
0550: return m_Attributes.elements(m_ClassIndex);
0551: }
0552:
0553: /**
0554: * Returns an enumeration of all instances in the dataset.
0555: *
0556: * @return enumeration of all instances in the dataset
0557: */
0558: public/*@non_null pure@*/Enumeration enumerateInstances() {
0559:
0560: return m_Instances.elements();
0561: }
0562:
0563: /**
0564: * Checks if two headers are equivalent.
0565: *
0566: * @param dataset another dataset
0567: * @return true if the header of the given dataset is equivalent
0568: * to this header
0569: */
0570: public/*@pure@*/boolean equalHeaders(Instances dataset) {
0571:
0572: // Check class and all attributes
0573: if (m_ClassIndex != dataset.m_ClassIndex) {
0574: return false;
0575: }
0576: if (m_Attributes.size() != dataset.m_Attributes.size()) {
0577: return false;
0578: }
0579: for (int i = 0; i < m_Attributes.size(); i++) {
0580: if (!(attribute(i).equals(dataset.attribute(i)))) {
0581: return false;
0582: }
0583: }
0584: return true;
0585: }
0586:
0587: /**
0588: * Returns the first instance in the set.
0589: *
0590: * @return the first instance in the set
0591: */
0592: //@ requires numInstances() > 0;
0593: public/*@non_null pure@*/Instance firstInstance() {
0594:
0595: return (Instance) m_Instances.firstElement();
0596: }
0597:
0598: /**
0599: * Returns a random number generator. The initial seed of the random
0600: * number generator depends on the given seed and the hash code of
0601: * a string representation of a instances chosen based on the given
0602: * seed.
0603: *
0604: * @param seed the given seed
0605: * @return the random number generator
0606: */
0607: public Random getRandomNumberGenerator(long seed) {
0608:
0609: Random r = new Random(seed);
0610: r.setSeed(instance(r.nextInt(numInstances())).toString()
0611: .hashCode()
0612: + seed);
0613: return r;
0614: }
0615:
0616: /**
0617: * Inserts an attribute at the given position (0 to
0618: * numAttributes()) and sets all values to be missing.
0619: * Shallow copies the attribute before it is inserted, and performs
0620: * a deep copy of the existing attribute information.
0621: *
0622: * @param att the attribute to be inserted
0623: * @param position the attribute's position (position starts with 0)
0624: * @throws IllegalArgumentException if the given index is out of range
0625: */
0626: //@ requires 0 <= position;
0627: //@ requires position <= numAttributes();
0628: public void insertAttributeAt(/*@non_null@*/Attribute att,
0629: int position) {
0630:
0631: if ((position < 0) || (position > m_Attributes.size())) {
0632: throw new IllegalArgumentException("Index out of range");
0633: }
0634: att = (Attribute) att.copy();
0635: freshAttributeInfo();
0636: att.setIndex(position);
0637: m_Attributes.insertElementAt(att, position);
0638: for (int i = position + 1; i < m_Attributes.size(); i++) {
0639: Attribute current = (Attribute) m_Attributes.elementAt(i);
0640: current.setIndex(current.index() + 1);
0641: }
0642: for (int i = 0; i < numInstances(); i++) {
0643: instance(i).forceInsertAttributeAt(position);
0644: }
0645: if (m_ClassIndex >= position) {
0646: m_ClassIndex++;
0647: }
0648: }
0649:
0650: /**
0651: * Returns the instance at the given position.
0652: *
0653: * @param index the instance's index (index starts with 0)
0654: * @return the instance at the given position
0655: */
0656: //@ requires 0 <= index;
0657: //@ requires index < numInstances();
0658: public/*@non_null pure@*/Instance instance(int index) {
0659:
0660: return (Instance) m_Instances.elementAt(index);
0661: }
0662:
0663: /**
0664: * Returns the kth-smallest attribute value of a numeric attribute.
0665: * Note that calling this method will change the order of the data!
0666: *
0667: * @param att the Attribute object
0668: * @param k the value of k
0669: * @return the kth-smallest value
0670: */
0671: public double kthSmallestValue(Attribute att, int k) {
0672:
0673: return kthSmallestValue(att.index(), k);
0674: }
0675:
0676: /**
0677: * Returns the kth-smallest attribute value of a numeric attribute.
0678: * Note that calling this method will change the order of the data!
0679: * The number of non-missing values in the data must be as least
0680: * as last as k for this to work.
0681: *
0682: * @param attIndex the attribute's index
0683: * @param k the value of k
0684: * @return the kth-smallest value
0685: */
0686: public double kthSmallestValue(int attIndex, int k) {
0687:
0688: if (!attribute(attIndex).isNumeric()) {
0689: throw new IllegalArgumentException(
0690: "Instances: attribute must be numeric to compute kth-smallest value.");
0691: }
0692:
0693: int i, j;
0694:
0695: // move all instances with missing values to end
0696: j = numInstances() - 1;
0697: i = 0;
0698: while (i <= j) {
0699: if (instance(j).isMissing(attIndex)) {
0700: j--;
0701: } else {
0702: if (instance(i).isMissing(attIndex)) {
0703: swap(i, j);
0704: j--;
0705: }
0706: i++;
0707: }
0708: }
0709:
0710: if ((k < 0) || (k > j)) {
0711: throw new IllegalArgumentException(
0712: "Instances: value for k for computing kth-smallest value too large.");
0713: }
0714:
0715: return instance(select(attIndex, 0, j, k)).value(attIndex);
0716: }
0717:
0718: /**
0719: * Returns the last instance in the set.
0720: *
0721: * @return the last instance in the set
0722: */
0723: //@ requires numInstances() > 0;
0724: public/*@non_null pure@*/Instance lastInstance() {
0725:
0726: return (Instance) m_Instances.lastElement();
0727: }
0728:
0729: /**
0730: * Returns the mean (mode) for a numeric (nominal) attribute as
0731: * a floating-point value. Returns 0 if the attribute is neither nominal nor
0732: * numeric. If all values are missing it returns zero.
0733: *
0734: * @param attIndex the attribute's index (index starts with 0)
0735: * @return the mean or the mode
0736: */
0737: public/*@pure@*/double meanOrMode(int attIndex) {
0738:
0739: double result, found;
0740: int[] counts;
0741:
0742: if (attribute(attIndex).isNumeric()) {
0743: result = found = 0;
0744: for (int j = 0; j < numInstances(); j++) {
0745: if (!instance(j).isMissing(attIndex)) {
0746: found += instance(j).weight();
0747: result += instance(j).weight()
0748: * instance(j).value(attIndex);
0749: }
0750: }
0751: if (found <= 0) {
0752: return 0;
0753: } else {
0754: return result / found;
0755: }
0756: } else if (attribute(attIndex).isNominal()) {
0757: counts = new int[attribute(attIndex).numValues()];
0758: for (int j = 0; j < numInstances(); j++) {
0759: if (!instance(j).isMissing(attIndex)) {
0760: counts[(int) instance(j).value(attIndex)] += instance(
0761: j).weight();
0762: }
0763: }
0764: return (double) Utils.maxIndex(counts);
0765: } else {
0766: return 0;
0767: }
0768: }
0769:
0770: /**
0771: * Returns the mean (mode) for a numeric (nominal) attribute as a
0772: * floating-point value. Returns 0 if the attribute is neither
0773: * nominal nor numeric. If all values are missing it returns zero.
0774: *
0775: * @param att the attribute
0776: * @return the mean or the mode
0777: */
0778: public/*@pure@*/double meanOrMode(Attribute att) {
0779:
0780: return meanOrMode(att.index());
0781: }
0782:
0783: /**
0784: * Returns the number of attributes.
0785: *
0786: * @return the number of attributes as an integer
0787: */
0788: //@ ensures \result == m_Attributes.size();
0789: public/*@pure@*/int numAttributes() {
0790:
0791: return m_Attributes.size();
0792: }
0793:
0794: /**
0795: * Returns the number of class labels.
0796: *
0797: * @return the number of class labels as an integer if the class
0798: * attribute is nominal, 1 otherwise.
0799: * @throws UnassignedClassException if the class is not set
0800: */
0801: //@ requires classIndex() >= 0;
0802: public/*@pure@*/int numClasses() {
0803:
0804: if (m_ClassIndex < 0) {
0805: throw new UnassignedClassException(
0806: "Class index is negative (not set)!");
0807: }
0808: if (!classAttribute().isNominal()) {
0809: return 1;
0810: } else {
0811: return classAttribute().numValues();
0812: }
0813: }
0814:
0815: /**
0816: * Returns the number of distinct values of a given attribute.
0817: * Returns the number of instances if the attribute is a
0818: * string attribute. The value 'missing' is not counted.
0819: *
0820: * @param attIndex the attribute (index starts with 0)
0821: * @return the number of distinct values of a given attribute
0822: */
0823: //@ requires 0 <= attIndex;
0824: //@ requires attIndex < numAttributes();
0825: public/*@pure@*/int numDistinctValues(int attIndex) {
0826:
0827: if (attribute(attIndex).isNumeric()) {
0828: double[] attVals = attributeToDoubleArray(attIndex);
0829: int[] sorted = Utils.sort(attVals);
0830: double prev = 0;
0831: int counter = 0;
0832: for (int i = 0; i < sorted.length; i++) {
0833: Instance current = instance(sorted[i]);
0834: if (current.isMissing(attIndex)) {
0835: break;
0836: }
0837: if ((i == 0) || (current.value(attIndex) > prev)) {
0838: prev = current.value(attIndex);
0839: counter++;
0840: }
0841: }
0842: return counter;
0843: } else {
0844: return attribute(attIndex).numValues();
0845: }
0846: }
0847:
0848: /**
0849: * Returns the number of distinct values of a given attribute.
0850: * Returns the number of instances if the attribute is a
0851: * string attribute. The value 'missing' is not counted.
0852: *
0853: * @param att the attribute
0854: * @return the number of distinct values of a given attribute
0855: */
0856: public/*@pure@*/int numDistinctValues(/*@non_null@*/Attribute att) {
0857:
0858: return numDistinctValues(att.index());
0859: }
0860:
0861: /**
0862: * Returns the number of instances in the dataset.
0863: *
0864: * @return the number of instances in the dataset as an integer
0865: */
0866: //@ ensures \result == m_Instances.size();
0867: public/*@pure@*/int numInstances() {
0868:
0869: return m_Instances.size();
0870: }
0871:
0872: /**
0873: * Shuffles the instances in the set so that they are ordered
0874: * randomly.
0875: *
0876: * @param random a random number generator
0877: */
0878: public void randomize(Random random) {
0879:
0880: for (int j = numInstances() - 1; j > 0; j--)
0881: swap(j, random.nextInt(j + 1));
0882: }
0883:
0884: /**
0885: * Reads a single instance from the reader and appends it
0886: * to the dataset. Automatically expands the dataset if it
0887: * is not large enough to hold the instance. This method does
0888: * not check for carriage return at the end of the line.
0889: *
0890: * @param reader the reader
0891: * @return false if end of file has been reached
0892: * @throws IOException if the information is not read
0893: * successfully
0894: * @deprecated instead of using this method in conjunction with the
0895: * <code>readInstance(Reader)</code> method, one should use the
0896: * <code>ArffLoader</code> or <code>DataSource</code> class instead.
0897: * @see weka.core.converters.ArffLoader
0898: * @see weka.core.converters.ConverterUtils.DataSource
0899: */
0900: @Deprecated
0901: public boolean readInstance(Reader reader) throws IOException {
0902:
0903: ArffReader arff = new ArffReader(reader, this , m_Lines, 1);
0904: Instance inst = arff.readInstance(arff.getData(), false);
0905: m_Lines = arff.getLineNo();
0906: if (inst != null) {
0907: add(inst);
0908: return true;
0909: } else {
0910: return false;
0911: }
0912: }
0913:
0914: /**
0915: * Returns the relation's name.
0916: *
0917: * @return the relation's name as a string
0918: */
0919: //@ ensures \result == m_RelationName;
0920: public/*@pure@*/String relationName() {
0921:
0922: return m_RelationName;
0923: }
0924:
0925: /**
0926: * Renames an attribute. This change only affects this
0927: * dataset.
0928: *
0929: * @param att the attribute's index (index starts with 0)
0930: * @param name the new name
0931: */
0932: public void renameAttribute(int att, String name) {
0933:
0934: Attribute newAtt = attribute(att).copy(name);
0935: FastVector newVec = new FastVector(numAttributes());
0936:
0937: for (int i = 0; i < numAttributes(); i++) {
0938: if (i == att) {
0939: newVec.addElement(newAtt);
0940: } else {
0941: newVec.addElement(attribute(i));
0942: }
0943: }
0944: m_Attributes = newVec;
0945: }
0946:
0947: /**
0948: * Renames an attribute. This change only affects this
0949: * dataset.
0950: *
0951: * @param att the attribute
0952: * @param name the new name
0953: */
0954: public void renameAttribute(Attribute att, String name) {
0955:
0956: renameAttribute(att.index(), name);
0957: }
0958:
0959: /**
0960: * Renames the value of a nominal (or string) attribute value. This
0961: * change only affects this dataset.
0962: *
0963: * @param att the attribute's index (index starts with 0)
0964: * @param val the value's index (index starts with 0)
0965: * @param name the new name
0966: */
0967: public void renameAttributeValue(int att, int val, String name) {
0968:
0969: Attribute newAtt = (Attribute) attribute(att).copy();
0970: FastVector newVec = new FastVector(numAttributes());
0971:
0972: newAtt.setValue(val, name);
0973: for (int i = 0; i < numAttributes(); i++) {
0974: if (i == att) {
0975: newVec.addElement(newAtt);
0976: } else {
0977: newVec.addElement(attribute(i));
0978: }
0979: }
0980: m_Attributes = newVec;
0981: }
0982:
0983: /**
0984: * Renames the value of a nominal (or string) attribute value. This
0985: * change only affects this dataset.
0986: *
0987: * @param att the attribute
0988: * @param val the value
0989: * @param name the new name
0990: */
0991: public void renameAttributeValue(Attribute att, String val,
0992: String name) {
0993:
0994: int v = att.indexOfValue(val);
0995: if (v == -1)
0996: throw new IllegalArgumentException(val + " not found");
0997: renameAttributeValue(att.index(), v, name);
0998: }
0999:
1000: /**
1001: * Creates a new dataset of the same size using random sampling
1002: * with replacement.
1003: *
1004: * @param random a random number generator
1005: * @return the new dataset
1006: */
1007: public Instances resample(Random random) {
1008:
1009: Instances newData = new Instances(this , numInstances());
1010: while (newData.numInstances() < numInstances()) {
1011: newData.add(instance(random.nextInt(numInstances())));
1012: }
1013: return newData;
1014: }
1015:
1016: /**
1017: * Creates a new dataset of the same size using random sampling
1018: * with replacement according to the current instance weights. The
1019: * weights of the instances in the new dataset are set to one.
1020: *
1021: * @param random a random number generator
1022: * @return the new dataset
1023: */
1024: public Instances resampleWithWeights(Random random) {
1025:
1026: double[] weights = new double[numInstances()];
1027: for (int i = 0; i < weights.length; i++) {
1028: weights[i] = instance(i).weight();
1029: }
1030: return resampleWithWeights(random, weights);
1031: }
1032:
1033: /**
1034: * Creates a new dataset of the same size using random sampling
1035: * with replacement according to the given weight vector. The
1036: * weights of the instances in the new dataset are set to one.
1037: * The length of the weight vector has to be the same as the
1038: * number of instances in the dataset, and all weights have to
1039: * be positive.
1040: *
1041: * @param random a random number generator
1042: * @param weights the weight vector
1043: * @return the new dataset
1044: * @throws IllegalArgumentException if the weights array is of the wrong
1045: * length or contains negative weights.
1046: */
1047: public Instances resampleWithWeights(Random random, double[] weights) {
1048:
1049: if (weights.length != numInstances()) {
1050: throw new IllegalArgumentException(
1051: "weights.length != numInstances.");
1052: }
1053: Instances newData = new Instances(this , numInstances());
1054: if (numInstances() == 0) {
1055: return newData;
1056: }
1057: double[] probabilities = new double[numInstances()];
1058: double sumProbs = 0, sumOfWeights = Utils.sum(weights);
1059: for (int i = 0; i < numInstances(); i++) {
1060: sumProbs += random.nextDouble();
1061: probabilities[i] = sumProbs;
1062: }
1063: Utils.normalize(probabilities, sumProbs / sumOfWeights);
1064:
1065: // Make sure that rounding errors don't mess things up
1066: probabilities[numInstances() - 1] = sumOfWeights;
1067: int k = 0;
1068: int l = 0;
1069: sumProbs = 0;
1070: while ((k < numInstances() && (l < numInstances()))) {
1071: if (weights[l] < 0) {
1072: throw new IllegalArgumentException(
1073: "Weights have to be positive.");
1074: }
1075: sumProbs += weights[l];
1076: while ((k < numInstances())
1077: && (probabilities[k] <= sumProbs)) {
1078: newData.add(instance(l));
1079: newData.instance(k).setWeight(1);
1080: k++;
1081: }
1082: l++;
1083: }
1084: return newData;
1085: }
1086:
1087: /**
1088: * Sets the class attribute.
1089: *
1090: * @param att attribute to be the class
1091: */
1092: public void setClass(Attribute att) {
1093:
1094: m_ClassIndex = att.index();
1095: }
1096:
1097: /**
1098: * Sets the class index of the set.
1099: * If the class index is negative there is assumed to be no class.
1100: * (ie. it is undefined)
1101: *
1102: * @param classIndex the new class index (index starts with 0)
1103: * @throws IllegalArgumentException if the class index is too big or < 0
1104: */
1105: public void setClassIndex(int classIndex) {
1106:
1107: if (classIndex >= numAttributes()) {
1108: throw new IllegalArgumentException("Invalid class index: "
1109: + classIndex);
1110: }
1111: m_ClassIndex = classIndex;
1112: }
1113:
1114: /**
1115: * Sets the relation's name.
1116: *
1117: * @param newName the new relation name.
1118: */
1119: public void setRelationName(/*@non_null@*/String newName) {
1120:
1121: m_RelationName = newName;
1122: }
1123:
1124: /**
1125: * Sorts the instances based on an attribute. For numeric attributes,
1126: * instances are sorted in ascending order. For nominal attributes,
1127: * instances are sorted based on the attribute label ordering
1128: * specified in the header. Instances with missing values for the
1129: * attribute are placed at the end of the dataset.
1130: *
1131: * @param attIndex the attribute's index (index starts with 0)
1132: */
1133: public void sort(int attIndex) {
1134:
1135: int i, j;
1136:
1137: // move all instances with missing values to end
1138: j = numInstances() - 1;
1139: i = 0;
1140: while (i <= j) {
1141: if (instance(j).isMissing(attIndex)) {
1142: j--;
1143: } else {
1144: if (instance(i).isMissing(attIndex)) {
1145: swap(i, j);
1146: j--;
1147: }
1148: i++;
1149: }
1150: }
1151: quickSort(attIndex, 0, j);
1152: }
1153:
1154: /**
1155: * Sorts the instances based on an attribute. For numeric attributes,
1156: * instances are sorted into ascending order. For nominal attributes,
1157: * instances are sorted based on the attribute label ordering
1158: * specified in the header. Instances with missing values for the
1159: * attribute are placed at the end of the dataset.
1160: *
1161: * @param att the attribute
1162: */
1163: public void sort(Attribute att) {
1164:
1165: sort(att.index());
1166: }
1167:
1168: /**
1169: * Stratifies a set of instances according to its class values
1170: * if the class attribute is nominal (so that afterwards a
1171: * stratified cross-validation can be performed).
1172: *
1173: * @param numFolds the number of folds in the cross-validation
1174: * @throws UnassignedClassException if the class is not set
1175: */
1176: public void stratify(int numFolds) {
1177:
1178: if (numFolds <= 0) {
1179: throw new IllegalArgumentException(
1180: "Number of folds must be greater than 1");
1181: }
1182: if (m_ClassIndex < 0) {
1183: throw new UnassignedClassException(
1184: "Class index is negative (not set)!");
1185: }
1186: if (classAttribute().isNominal()) {
1187:
1188: // sort by class
1189: int index = 1;
1190: while (index < numInstances()) {
1191: Instance instance1 = instance(index - 1);
1192: for (int j = index; j < numInstances(); j++) {
1193: Instance instance2 = instance(j);
1194: if ((instance1.classValue() == instance2
1195: .classValue())
1196: || (instance1.classIsMissing() && instance2
1197: .classIsMissing())) {
1198: swap(index, j);
1199: index++;
1200: }
1201: }
1202: index++;
1203: }
1204: stratStep(numFolds);
1205: }
1206: }
1207:
1208: /**
1209: * Computes the sum of all the instances' weights.
1210: *
1211: * @return the sum of all the instances' weights as a double
1212: */
1213: public/*@pure@*/double sumOfWeights() {
1214:
1215: double sum = 0;
1216:
1217: for (int i = 0; i < numInstances(); i++) {
1218: sum += instance(i).weight();
1219: }
1220: return sum;
1221: }
1222:
1223: /**
1224: * Creates the test set for one fold of a cross-validation on
1225: * the dataset.
1226: *
1227: * @param numFolds the number of folds in the cross-validation. Must
1228: * be greater than 1.
1229: * @param numFold 0 for the first fold, 1 for the second, ...
1230: * @return the test set as a set of weighted instances
1231: * @throws IllegalArgumentException if the number of folds is less than 2
1232: * or greater than the number of instances.
1233: */
1234: //@ requires 2 <= numFolds && numFolds < numInstances();
1235: //@ requires 0 <= numFold && numFold < numFolds;
1236: public Instances testCV(int numFolds, int numFold) {
1237:
1238: int numInstForFold, first, offset;
1239: Instances test;
1240:
1241: if (numFolds < 2) {
1242: throw new IllegalArgumentException(
1243: "Number of folds must be at least 2!");
1244: }
1245: if (numFolds > numInstances()) {
1246: throw new IllegalArgumentException(
1247: "Can't have more folds than instances!");
1248: }
1249: numInstForFold = numInstances() / numFolds;
1250: if (numFold < numInstances() % numFolds) {
1251: numInstForFold++;
1252: offset = numFold;
1253: } else
1254: offset = numInstances() % numFolds;
1255: test = new Instances(this , numInstForFold);
1256: first = numFold * (numInstances() / numFolds) + offset;
1257: copyInstances(first, test, numInstForFold);
1258: return test;
1259: }
1260:
1261: /**
1262: * Returns the dataset as a string in ARFF format. Strings
1263: * are quoted if they contain whitespace characters, or if they
1264: * are a question mark.
1265: *
1266: * @return the dataset in ARFF format as a string
1267: */
1268: public String toString() {
1269:
1270: StringBuffer text = new StringBuffer();
1271:
1272: text.append(ARFF_RELATION).append(" ").append(
1273: Utils.quote(m_RelationName)).append("\n\n");
1274: for (int i = 0; i < numAttributes(); i++) {
1275: text.append(attribute(i)).append("\n");
1276: }
1277: text.append("\n").append(ARFF_DATA).append("\n");
1278:
1279: text.append(stringWithoutHeader());
1280: return text.toString();
1281: }
1282:
1283: /**
1284: * Returns the instances in the dataset as a string in ARFF format. Strings
1285: * are quoted if they contain whitespace characters, or if they
1286: * are a question mark.
1287: *
1288: * @return the dataset in ARFF format as a string
1289: */
1290: protected String stringWithoutHeader() {
1291:
1292: StringBuffer text = new StringBuffer();
1293:
1294: for (int i = 0; i < numInstances(); i++) {
1295: text.append(instance(i));
1296: if (i < numInstances() - 1) {
1297: text.append('\n');
1298: }
1299: }
1300: return text.toString();
1301: }
1302:
1303: /**
1304: * Creates the training set for one fold of a cross-validation
1305: * on the dataset.
1306: *
1307: * @param numFolds the number of folds in the cross-validation. Must
1308: * be greater than 1.
1309: * @param numFold 0 for the first fold, 1 for the second, ...
1310: * @return the training set
1311: * @throws IllegalArgumentException if the number of folds is less than 2
1312: * or greater than the number of instances.
1313: */
1314: //@ requires 2 <= numFolds && numFolds < numInstances();
1315: //@ requires 0 <= numFold && numFold < numFolds;
1316: public Instances trainCV(int numFolds, int numFold) {
1317:
1318: int numInstForFold, first, offset;
1319: Instances train;
1320:
1321: if (numFolds < 2) {
1322: throw new IllegalArgumentException(
1323: "Number of folds must be at least 2!");
1324: }
1325: if (numFolds > numInstances()) {
1326: throw new IllegalArgumentException(
1327: "Can't have more folds than instances!");
1328: }
1329: numInstForFold = numInstances() / numFolds;
1330: if (numFold < numInstances() % numFolds) {
1331: numInstForFold++;
1332: offset = numFold;
1333: } else
1334: offset = numInstances() % numFolds;
1335: train = new Instances(this , numInstances() - numInstForFold);
1336: first = numFold * (numInstances() / numFolds) + offset;
1337: copyInstances(0, train, first);
1338: copyInstances(first + numInstForFold, train, numInstances()
1339: - first - numInstForFold);
1340:
1341: return train;
1342: }
1343:
1344: /**
1345: * Creates the training set for one fold of a cross-validation
1346: * on the dataset. The data is subsequently randomized based
1347: * on the given random number generator.
1348: *
1349: * @param numFolds the number of folds in the cross-validation. Must
1350: * be greater than 1.
1351: * @param numFold 0 for the first fold, 1 for the second, ...
1352: * @param random the random number generator
1353: * @return the training set
1354: * @throws IllegalArgumentException if the number of folds is less than 2
1355: * or greater than the number of instances.
1356: */
1357: //@ requires 2 <= numFolds && numFolds < numInstances();
1358: //@ requires 0 <= numFold && numFold < numFolds;
1359: public Instances trainCV(int numFolds, int numFold, Random random) {
1360:
1361: Instances train = trainCV(numFolds, numFold);
1362: train.randomize(random);
1363: return train;
1364: }
1365:
1366: /**
1367: * Computes the variance for a numeric attribute.
1368: *
1369: * @param attIndex the numeric attribute (index starts with 0)
1370: * @return the variance if the attribute is numeric
1371: * @throws IllegalArgumentException if the attribute is not numeric
1372: */
1373: public/*@pure@*/double variance(int attIndex) {
1374:
1375: double sum = 0, sumSquared = 0, sumOfWeights = 0;
1376:
1377: if (!attribute(attIndex).isNumeric()) {
1378: throw new IllegalArgumentException(
1379: "Can't compute variance because attribute is "
1380: + "not numeric!");
1381: }
1382: for (int i = 0; i < numInstances(); i++) {
1383: if (!instance(i).isMissing(attIndex)) {
1384: sum += instance(i).weight()
1385: * instance(i).value(attIndex);
1386: sumSquared += instance(i).weight()
1387: * instance(i).value(attIndex)
1388: * instance(i).value(attIndex);
1389: sumOfWeights += instance(i).weight();
1390: }
1391: }
1392: if (sumOfWeights <= 1) {
1393: return 0;
1394: }
1395: double result = (sumSquared - (sum * sum / sumOfWeights))
1396: / (sumOfWeights - 1);
1397:
1398: // We don't like negative variance
1399: if (result < 0) {
1400: return 0;
1401: } else {
1402: return result;
1403: }
1404: }
1405:
1406: /**
1407: * Computes the variance for a numeric attribute.
1408: *
1409: * @param att the numeric attribute
1410: * @return the variance if the attribute is numeric
1411: * @throws IllegalArgumentException if the attribute is not numeric
1412: */
1413: public/*@pure@*/double variance(Attribute att) {
1414:
1415: return variance(att.index());
1416: }
1417:
1418: /**
1419: * Calculates summary statistics on the values that appear in this
1420: * set of instances for a specified attribute.
1421: *
1422: * @param index the index of the attribute to summarize (index starts with 0)
1423: * @return an AttributeStats object with it's fields calculated.
1424: */
1425: //@ requires 0 <= index && index < numAttributes();
1426: public AttributeStats attributeStats(int index) {
1427:
1428: AttributeStats result = new AttributeStats();
1429: if (attribute(index).isNominal()) {
1430: result.nominalCounts = new int[attribute(index).numValues()];
1431: }
1432: if (attribute(index).isNumeric()) {
1433: result.numericStats = new weka.experiment.Stats();
1434: }
1435: result.totalCount = numInstances();
1436:
1437: double[] attVals = attributeToDoubleArray(index);
1438: int[] sorted = Utils.sort(attVals);
1439: int currentCount = 0;
1440: double prev = Instance.missingValue();
1441: for (int j = 0; j < numInstances(); j++) {
1442: Instance current = instance(sorted[j]);
1443: if (current.isMissing(index)) {
1444: result.missingCount = numInstances() - j;
1445: break;
1446: }
1447: if (current.value(index) == prev) {
1448: currentCount++;
1449: } else {
1450: result.addDistinct(prev, currentCount);
1451: currentCount = 1;
1452: prev = current.value(index);
1453: }
1454: }
1455: result.addDistinct(prev, currentCount);
1456: result.distinctCount--; // So we don't count "missing" as a value
1457: return result;
1458: }
1459:
1460: /**
1461: * Gets the value of all instances in this dataset for a particular
1462: * attribute. Useful in conjunction with Utils.sort to allow iterating
1463: * through the dataset in sorted order for some attribute.
1464: *
1465: * @param index the index of the attribute.
1466: * @return an array containing the value of the desired attribute for
1467: * each instance in the dataset.
1468: */
1469: //@ requires 0 <= index && index < numAttributes();
1470: public/*@pure@*/double[] attributeToDoubleArray(int index) {
1471:
1472: double[] result = new double[numInstances()];
1473: for (int i = 0; i < result.length; i++) {
1474: result[i] = instance(i).value(index);
1475: }
1476: return result;
1477: }
1478:
1479: /**
1480: * Generates a string summarizing the set of instances. Gives a breakdown
1481: * for each attribute indicating the number of missing/discrete/unique
1482: * values and other information.
1483: *
1484: * @return a string summarizing the dataset
1485: */
1486: public String toSummaryString() {
1487:
1488: StringBuffer result = new StringBuffer();
1489: result.append("Relation Name: ").append(relationName())
1490: .append('\n');
1491: result.append("Num Instances: ").append(numInstances())
1492: .append('\n');
1493: result.append("Num Attributes: ").append(numAttributes())
1494: .append('\n');
1495: result.append('\n');
1496:
1497: result.append(Utils.padLeft("", 5)).append(
1498: Utils.padRight("Name", 25));
1499: result.append(Utils.padLeft("Type", 5)).append(
1500: Utils.padLeft("Nom", 5));
1501: result.append(Utils.padLeft("Int", 5)).append(
1502: Utils.padLeft("Real", 5));
1503: result.append(Utils.padLeft("Missing", 12));
1504: result.append(Utils.padLeft("Unique", 12));
1505: result.append(Utils.padLeft("Dist", 6)).append('\n');
1506: for (int i = 0; i < numAttributes(); i++) {
1507: Attribute a = attribute(i);
1508: AttributeStats as = attributeStats(i);
1509: result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
1510: result.append(Utils.padRight(a.name(), 25)).append(' ');
1511: long percent;
1512: switch (a.type()) {
1513: case Attribute.NOMINAL:
1514: result.append(Utils.padLeft("Nom", 4)).append(' ');
1515: percent = Math.round(100.0 * as.intCount
1516: / as.totalCount);
1517: result.append(Utils.padLeft("" + percent, 3)).append(
1518: "% ");
1519: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1520: percent = Math.round(100.0 * as.realCount
1521: / as.totalCount);
1522: result.append(Utils.padLeft("" + percent, 3)).append(
1523: "% ");
1524: break;
1525: case Attribute.NUMERIC:
1526: result.append(Utils.padLeft("Num", 4)).append(' ');
1527: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1528: percent = Math.round(100.0 * as.intCount
1529: / as.totalCount);
1530: result.append(Utils.padLeft("" + percent, 3)).append(
1531: "% ");
1532: percent = Math.round(100.0 * as.realCount
1533: / as.totalCount);
1534: result.append(Utils.padLeft("" + percent, 3)).append(
1535: "% ");
1536: break;
1537: case Attribute.DATE:
1538: result.append(Utils.padLeft("Dat", 4)).append(' ');
1539: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1540: percent = Math.round(100.0 * as.intCount
1541: / as.totalCount);
1542: result.append(Utils.padLeft("" + percent, 3)).append(
1543: "% ");
1544: percent = Math.round(100.0 * as.realCount
1545: / as.totalCount);
1546: result.append(Utils.padLeft("" + percent, 3)).append(
1547: "% ");
1548: break;
1549: case Attribute.STRING:
1550: result.append(Utils.padLeft("Str", 4)).append(' ');
1551: percent = Math.round(100.0 * as.intCount
1552: / as.totalCount);
1553: result.append(Utils.padLeft("" + percent, 3)).append(
1554: "% ");
1555: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1556: percent = Math.round(100.0 * as.realCount
1557: / as.totalCount);
1558: result.append(Utils.padLeft("" + percent, 3)).append(
1559: "% ");
1560: break;
1561: case Attribute.RELATIONAL:
1562: result.append(Utils.padLeft("Rel", 4)).append(' ');
1563: percent = Math.round(100.0 * as.intCount
1564: / as.totalCount);
1565: result.append(Utils.padLeft("" + percent, 3)).append(
1566: "% ");
1567: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1568: percent = Math.round(100.0 * as.realCount
1569: / as.totalCount);
1570: result.append(Utils.padLeft("" + percent, 3)).append(
1571: "% ");
1572: break;
1573: default:
1574: result.append(Utils.padLeft("???", 4)).append(' ');
1575: result.append(Utils.padLeft("" + 0, 3)).append("% ");
1576: percent = Math.round(100.0 * as.intCount
1577: / as.totalCount);
1578: result.append(Utils.padLeft("" + percent, 3)).append(
1579: "% ");
1580: percent = Math.round(100.0 * as.realCount
1581: / as.totalCount);
1582: result.append(Utils.padLeft("" + percent, 3)).append(
1583: "% ");
1584: break;
1585: }
1586: result.append(Utils.padLeft("" + as.missingCount, 5))
1587: .append(" /");
1588: percent = Math.round(100.0 * as.missingCount
1589: / as.totalCount);
1590: result.append(Utils.padLeft("" + percent, 3)).append("% ");
1591: result.append(Utils.padLeft("" + as.uniqueCount, 5))
1592: .append(" /");
1593: percent = Math
1594: .round(100.0 * as.uniqueCount / as.totalCount);
1595: result.append(Utils.padLeft("" + percent, 3)).append("% ");
1596: result.append(Utils.padLeft("" + as.distinctCount, 5))
1597: .append(' ');
1598: result.append('\n');
1599: }
1600: return result.toString();
1601: }
1602:
1603: /**
1604: * Copies instances from one set to the end of another
1605: * one.
1606: *
1607: * @param from the position of the first instance to be copied
1608: * @param dest the destination for the instances
1609: * @param num the number of instances to be copied
1610: */
1611: //@ requires 0 <= from && from <= numInstances() - num;
1612: //@ requires 0 <= num;
1613: protected void copyInstances(int from, /*@non_null@*/
1614: Instances dest, int num) {
1615:
1616: for (int i = 0; i < num; i++) {
1617: dest.add(instance(from + i));
1618: }
1619: }
1620:
1621: /**
1622: * Replaces the attribute information by a clone of
1623: * itself.
1624: */
1625: protected void freshAttributeInfo() {
1626:
1627: m_Attributes = (FastVector) m_Attributes.copyElements();
1628: }
1629:
1630: /**
1631: * Returns string including all instances, their weights and
1632: * their indices in the original dataset.
1633: *
1634: * @return description of instance and its weight as a string
1635: */
1636: protected/*@pure@*/String instancesAndWeights() {
1637:
1638: StringBuffer text = new StringBuffer();
1639:
1640: for (int i = 0; i < numInstances(); i++) {
1641: text.append(instance(i) + " " + instance(i).weight());
1642: if (i < numInstances() - 1) {
1643: text.append("\n");
1644: }
1645: }
1646: return text.toString();
1647: }
1648:
1649: /**
1650: * Partitions the instances around a pivot. Used by quicksort and
1651: * kthSmallestValue.
1652: *
1653: * @param attIndex the attribute's index (index starts with 0)
1654: * @param l the first index of the subset (index starts with 0)
1655: * @param r the last index of the subset (index starts with 0)
1656: *
1657: * @return the index of the middle element
1658: */
1659: //@ requires 0 <= attIndex && attIndex < numAttributes();
1660: //@ requires 0 <= left && left <= right && right < numInstances();
1661: protected int partition(int attIndex, int l, int r) {
1662:
1663: double pivot = instance((l + r) / 2).value(attIndex);
1664:
1665: while (l < r) {
1666: while ((instance(l).value(attIndex) < pivot) && (l < r)) {
1667: l++;
1668: }
1669: while ((instance(r).value(attIndex) > pivot) && (l < r)) {
1670: r--;
1671: }
1672: if (l < r) {
1673: swap(l, r);
1674: l++;
1675: r--;
1676: }
1677: }
1678: if ((l == r) && (instance(r).value(attIndex) > pivot)) {
1679: r--;
1680: }
1681:
1682: return r;
1683: }
1684:
1685: /**
1686: * Implements quicksort according to Manber's "Introduction to
1687: * Algorithms".
1688: *
1689: * @param attIndex the attribute's index (index starts with 0)
1690: * @param left the first index of the subset to be sorted (index starts with 0)
1691: * @param right the last index of the subset to be sorted (index starts with 0)
1692: */
1693: //@ requires 0 <= attIndex && attIndex < numAttributes();
1694: //@ requires 0 <= first && first <= right && right < numInstances();
1695: protected void quickSort(int attIndex, int left, int right) {
1696:
1697: if (left < right) {
1698: int middle = partition(attIndex, left, right);
1699: quickSort(attIndex, left, middle);
1700: quickSort(attIndex, middle + 1, right);
1701: }
1702: }
1703:
1704: /**
1705: * Implements computation of the kth-smallest element according
1706: * to Manber's "Introduction to Algorithms".
1707: *
1708: * @param attIndex the attribute's index (index starts with 0)
1709: * @param left the first index of the subset (index starts with 0)
1710: * @param right the last index of the subset (index starts with 0)
1711: * @param k the value of k
1712: *
1713: * @return the index of the kth-smallest element
1714: */
1715: //@ requires 0 <= attIndex && attIndex < numAttributes();
1716: //@ requires 0 <= first && first <= right && right < numInstances();
1717: protected int select(int attIndex, int left, int right, int k) {
1718:
1719: if (left == right) {
1720: return left;
1721: } else {
1722: int middle = partition(attIndex, left, right);
1723: if ((middle - left + 1) >= k) {
1724: return select(attIndex, left, middle, k);
1725: } else {
1726: return select(attIndex, middle + 1, right, k
1727: - (middle - left + 1));
1728: }
1729: }
1730: }
1731:
1732: /**
1733: * Help function needed for stratification of set.
1734: *
1735: * @param numFolds the number of folds for the stratification
1736: */
1737: protected void stratStep(int numFolds) {
1738:
1739: FastVector newVec = new FastVector(m_Instances.capacity());
1740: int start = 0, j;
1741:
1742: // create stratified batch
1743: while (newVec.size() < numInstances()) {
1744: j = start;
1745: while (j < numInstances()) {
1746: newVec.addElement(instance(j));
1747: j = j + numFolds;
1748: }
1749: start++;
1750: }
1751: m_Instances = newVec;
1752: }
1753:
1754: /**
1755: * Swaps two instances in the set.
1756: *
1757: * @param i the first instance's index (index starts with 0)
1758: * @param j the second instance's index (index starts with 0)
1759: */
1760: //@ requires 0 <= i && i < numInstances();
1761: //@ requires 0 <= j && j < numInstances();
1762: public void swap(int i, int j) {
1763:
1764: m_Instances.swap(i, j);
1765: }
1766:
1767: /**
1768: * Merges two sets of Instances together. The resulting set will have
1769: * all the attributes of the first set plus all the attributes of the
1770: * second set. The number of instances in both sets must be the same.
1771: *
1772: * @param first the first set of Instances
1773: * @param second the second set of Instances
1774: * @return the merged set of Instances
1775: * @throws IllegalArgumentException if the datasets are not the same size
1776: */
1777: public static Instances mergeInstances(Instances first,
1778: Instances second) {
1779:
1780: if (first.numInstances() != second.numInstances()) {
1781: throw new IllegalArgumentException(
1782: "Instance sets must be of the same size");
1783: }
1784:
1785: // Create the vector of merged attributes
1786: FastVector newAttributes = new FastVector();
1787: for (int i = 0; i < first.numAttributes(); i++) {
1788: newAttributes.addElement(first.attribute(i));
1789: }
1790: for (int i = 0; i < second.numAttributes(); i++) {
1791: newAttributes.addElement(second.attribute(i));
1792: }
1793:
1794: // Create the set of Instances
1795: Instances merged = new Instances(first.relationName() + '_'
1796: + second.relationName(), newAttributes, first
1797: .numInstances());
1798: // Merge each instance
1799: for (int i = 0; i < first.numInstances(); i++) {
1800: merged.add(first.instance(i).mergeInstance(
1801: second.instance(i)));
1802: }
1803: return merged;
1804: }
1805:
1806: /**
1807: * Method for testing this class.
1808: *
1809: * @param argv should contain one element: the name of an ARFF file
1810: */
1811: //@ requires argv != null;
1812: //@ requires argv.length == 1;
1813: //@ requires argv[0] != null;
1814: public static void test(String[] argv) {
1815:
1816: Instances instances, secondInstances, train, test, empty;
1817: Random random = new Random(2);
1818: Reader reader;
1819: int start, num;
1820: FastVector testAtts, testVals;
1821: int i, j;
1822:
1823: try {
1824: if (argv.length > 1) {
1825: throw (new Exception("Usage: Instances [<filename>]"));
1826: }
1827:
1828: // Creating set of instances from scratch
1829: testVals = new FastVector(2);
1830: testVals.addElement("first_value");
1831: testVals.addElement("second_value");
1832: testAtts = new FastVector(2);
1833: testAtts.addElement(new Attribute("nominal_attribute",
1834: testVals));
1835: testAtts.addElement(new Attribute("numeric_attribute"));
1836: instances = new Instances("test_set", testAtts, 10);
1837: instances.add(new Instance(instances.numAttributes()));
1838: instances.add(new Instance(instances.numAttributes()));
1839: instances.add(new Instance(instances.numAttributes()));
1840: instances.setClassIndex(0);
1841: System.out
1842: .println("\nSet of instances created from scratch:\n");
1843: System.out.println(instances);
1844:
1845: if (argv.length == 1) {
1846: String filename = argv[0];
1847: reader = new FileReader(filename);
1848:
1849: // Read first five instances and print them
1850: System.out
1851: .println("\nFirst five instances from file:\n");
1852: instances = new Instances(reader, 1);
1853: instances.setClassIndex(instances.numAttributes() - 1);
1854: i = 0;
1855: while ((i < 5) && (instances.readInstance(reader))) {
1856: i++;
1857: }
1858: System.out.println(instances);
1859:
1860: // Read all the instances in the file
1861: reader = new FileReader(filename);
1862: instances = new Instances(reader);
1863:
1864: // Make the last attribute be the class
1865: instances.setClassIndex(instances.numAttributes() - 1);
1866:
1867: // Print header and instances.
1868: System.out.println("\nDataset:\n");
1869: System.out.println(instances);
1870: System.out.println("\nClass index: "
1871: + instances.classIndex());
1872: }
1873:
1874: // Test basic methods based on class index.
1875: System.out.println("\nClass name: "
1876: + instances.classAttribute().name());
1877: System.out.println("\nClass index: "
1878: + instances.classIndex());
1879: System.out.println("\nClass is nominal: "
1880: + instances.classAttribute().isNominal());
1881: System.out.println("\nClass is numeric: "
1882: + instances.classAttribute().isNumeric());
1883: System.out.println("\nClasses:\n");
1884: for (i = 0; i < instances.numClasses(); i++) {
1885: System.out.println(instances.classAttribute().value(i));
1886: }
1887: System.out
1888: .println("\nClass values and labels of instances:\n");
1889: for (i = 0; i < instances.numInstances(); i++) {
1890: Instance inst = instances.instance(i);
1891: System.out.print(inst.classValue() + "\t");
1892: System.out.print(inst.toString(inst.classIndex()));
1893: if (instances.instance(i).classIsMissing()) {
1894: System.out.println("\tis missing");
1895: } else {
1896: System.out.println();
1897: }
1898: }
1899:
1900: // Create random weights.
1901: System.out
1902: .println("\nCreating random weights for instances.");
1903: for (i = 0; i < instances.numInstances(); i++) {
1904: instances.instance(i).setWeight(random.nextDouble());
1905: }
1906:
1907: // Print all instances and their weights (and the sum of weights).
1908: System.out.println("\nInstances and their weights:\n");
1909: System.out.println(instances.instancesAndWeights());
1910: System.out.print("\nSum of weights: ");
1911: System.out.println(instances.sumOfWeights());
1912:
1913: // Insert an attribute
1914: secondInstances = new Instances(instances);
1915: Attribute testAtt = new Attribute("Inserted");
1916: secondInstances.insertAttributeAt(testAtt, 0);
1917: System.out.println("\nSet with inserted attribute:\n");
1918: System.out.println(secondInstances);
1919: System.out.println("\nClass name: "
1920: + secondInstances.classAttribute().name());
1921:
1922: // Delete the attribute
1923: secondInstances.deleteAttributeAt(0);
1924: System.out.println("\nSet with attribute deleted:\n");
1925: System.out.println(secondInstances);
1926: System.out.println("\nClass name: "
1927: + secondInstances.classAttribute().name());
1928:
1929: // Test if headers are equal
1930: System.out.println("\nHeaders equal: "
1931: + instances.equalHeaders(secondInstances) + "\n");
1932:
1933: // Print data in internal format.
1934: System.out.println("\nData (internal values):\n");
1935: for (i = 0; i < instances.numInstances(); i++) {
1936: for (j = 0; j < instances.numAttributes(); j++) {
1937: if (instances.instance(i).isMissing(j)) {
1938: System.out.print("? ");
1939: } else {
1940: System.out.print(instances.instance(i).value(j)
1941: + " ");
1942: }
1943: }
1944: System.out.println();
1945: }
1946:
1947: // Just print header
1948: System.out.println("\nEmpty dataset:\n");
1949: empty = new Instances(instances, 0);
1950: System.out.println(empty);
1951: System.out.println("\nClass name: "
1952: + empty.classAttribute().name());
1953:
1954: // Create copy and rename an attribute and a value (if possible)
1955: if (empty.classAttribute().isNominal()) {
1956: Instances copy = new Instances(empty, 0);
1957: copy.renameAttribute(copy.classAttribute(), "new_name");
1958: copy.renameAttributeValue(copy.classAttribute(), copy
1959: .classAttribute().value(0), "new_val_name");
1960: System.out.println("\nDataset with names changed:\n"
1961: + copy);
1962: System.out.println("\nOriginal dataset:\n" + empty);
1963: }
1964:
1965: // Create and prints subset of instances.
1966: start = instances.numInstances() / 4;
1967: num = instances.numInstances() / 2;
1968: System.out.print("\nSubset of dataset: ");
1969: System.out.println(num + " instances from " + (start + 1)
1970: + ". instance");
1971: secondInstances = new Instances(instances, start, num);
1972: System.out.println("\nClass name: "
1973: + secondInstances.classAttribute().name());
1974:
1975: // Print all instances and their weights (and the sum of weights).
1976: System.out.println("\nInstances and their weights:\n");
1977: System.out.println(secondInstances.instancesAndWeights());
1978: System.out.print("\nSum of weights: ");
1979: System.out.println(secondInstances.sumOfWeights());
1980:
1981: // Create and print training and test sets for 3-fold
1982: // cross-validation.
1983: System.out.println("\nTrain and test folds for 3-fold CV:");
1984: if (instances.classAttribute().isNominal()) {
1985: instances.stratify(3);
1986: }
1987: for (j = 0; j < 3; j++) {
1988: train = instances.trainCV(3, j, new Random(1));
1989: test = instances.testCV(3, j);
1990:
1991: // Print all instances and their weights (and the sum of weights).
1992: System.out.println("\nTrain: ");
1993: System.out.println("\nInstances and their weights:\n");
1994: System.out.println(train.instancesAndWeights());
1995: System.out.print("\nSum of weights: ");
1996: System.out.println(train.sumOfWeights());
1997: System.out.println("\nClass name: "
1998: + train.classAttribute().name());
1999: System.out.println("\nTest: ");
2000: System.out.println("\nInstances and their weights:\n");
2001: System.out.println(test.instancesAndWeights());
2002: System.out.print("\nSum of weights: ");
2003: System.out.println(test.sumOfWeights());
2004: System.out.println("\nClass name: "
2005: + test.classAttribute().name());
2006: }
2007:
2008: // Randomize instances and print them.
2009: System.out.println("\nRandomized dataset:");
2010: instances.randomize(random);
2011:
2012: // Print all instances and their weights (and the sum of weights).
2013: System.out.println("\nInstances and their weights:\n");
2014: System.out.println(instances.instancesAndWeights());
2015: System.out.print("\nSum of weights: ");
2016: System.out.println(instances.sumOfWeights());
2017:
2018: // Sort instances according to first attribute and
2019: // print them.
2020: System.out
2021: .print("\nInstances sorted according to first attribute:\n ");
2022: instances.sort(0);
2023:
2024: // Print all instances and their weights (and the sum of weights).
2025: System.out.println("\nInstances and their weights:\n");
2026: System.out.println(instances.instancesAndWeights());
2027: System.out.print("\nSum of weights: ");
2028: System.out.println(instances.sumOfWeights());
2029: } catch (Exception e) {
2030: e.printStackTrace();
2031: }
2032: }
2033:
2034: /**
2035: * Main method for this class. The following calls are possible:
2036: * <ul>
2037: * <li>
2038: * <code>weka.core.Instances</code> help<br/>
2039: * prints a short list of possible commands.
2040: * </li>
2041: * <li>
2042: * <code>weka.core.Instances</code> <filename><br/>
2043: * prints a summary of a set of instances.
2044: * </li>
2045: * <li>
2046: * <code>weka.core.Instances</code> merge <filename1> <filename2><br/>
2047: * merges the two datasets (must have same number of instances) and
2048: * outputs the results on stdout.
2049: * </li>
2050: * <li>
2051: * <code>weka.core.Instances</code> append <filename1> <filename2><br/>
2052: * appends the second dataset to the first one (must have same headers) and
2053: * outputs the results on stdout.
2054: * </li>
2055: * <li>
2056: * <code>weka.core.Instances</code> randomize <seed> <filename><br/>
2057: * randomizes the dataset with the given seed and outputs the result on stdout.
2058: * </li>
2059: * </ul>
2060: *
2061: * @param args the commandline parameters
2062: */
2063: public static void main(String[] args) {
2064:
2065: try {
2066: Instances i;
2067: // read from stdin and print statistics
2068: if (args.length == 0) {
2069: DataSource source = new DataSource(System.in);
2070: i = source.getDataSet();
2071: System.out.println(i.toSummaryString());
2072: }
2073: // read file and print statistics
2074: else if ((args.length == 1) && (!args[0].equals("-h"))
2075: && (!args[0].equals("help"))) {
2076: DataSource source = new DataSource(args[0]);
2077: i = source.getDataSet();
2078: System.out.println(i.toSummaryString());
2079: }
2080: // read two files, merge them and print result to stdout
2081: else if ((args.length == 3)
2082: && (args[0].toLowerCase().equals("merge"))) {
2083: DataSource source1 = new DataSource(args[1]);
2084: DataSource source2 = new DataSource(args[2]);
2085: i = Instances.mergeInstances(source1.getDataSet(),
2086: source2.getDataSet());
2087: System.out.println(i);
2088: }
2089: // read two files, append them and print result to stdout
2090: else if ((args.length == 3)
2091: && (args[0].toLowerCase().equals("append"))) {
2092: DataSource source1 = new DataSource(args[1]);
2093: DataSource source2 = new DataSource(args[2]);
2094: if (!source1.getStructure().equalHeaders(
2095: source2.getStructure()))
2096: throw new Exception(
2097: "The two datasets have different headers!");
2098: Instances structure = source1.getStructure();
2099: System.out.println(source1.getStructure());
2100: while (source1.hasMoreElements(structure))
2101: System.out.println(source1.nextElement(structure));
2102: structure = source2.getStructure();
2103: while (source2.hasMoreElements(structure))
2104: System.out.println(source2.nextElement(structure));
2105: }
2106: // read file and seed value, randomize data and print result to stdout
2107: else if ((args.length == 3)
2108: && (args[0].toLowerCase().equals("randomize"))) {
2109: DataSource source = new DataSource(args[2]);
2110: i = source.getDataSet();
2111: i.randomize(new Random(Integer.parseInt(args[1])));
2112: System.out.println(i);
2113: }
2114: // wrong parameters
2115: else {
2116: System.err
2117: .println("\nUsage:\n"
2118: + "\tweka.core.Instances help\n"
2119: + "\tweka.core.Instances <filename>\n"
2120: + "\tweka.core.Instances merge <filename1> <filename2>\n"
2121: + "\tweka.core.Instances append <filename1> <filename2>\n"
2122: + "\tweka.core.Instances randomize <seed> <filename>\n");
2123: System.exit(1);
2124: }
2125: } catch (Exception ex) {
2126: ex.printStackTrace();
2127: System.err.println(ex.getMessage());
2128: }
2129: }
2130: }
|