Source Code Cross Referenced for Instances.java in » Science » weka » weka » core » Java Source Code / Java Documentation - Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Science » weka » weka.core 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /*
0002:         *    This program is free software; you can redistribute it and/or modify
0003:         *    it under the terms of the GNU General Public License as published by
0004:         *    the Free Software Foundation; either version 2 of the License, or
0005:         *    (at your option) any later version.
0006:         *
0007:         *    This program is distributed in the hope that it will be useful,
0008:         *    but WITHOUT ANY WARRANTY; without even the implied warranty of
0009:         *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0010:         *    GNU General Public License for more details.
0011:         *
0012:         *    You should have received a copy of the GNU General Public License
0013:         *    along with this program; if not, write to the Free Software
0014:         *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015:         */
0016:
0017:        /*
0018:         *    Instances.java
0019:         *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020:         *
0021:         */
0022:
0023:        package weka.core;
0024:
0025:        import weka.core.converters.ArffLoader.ArffReader;
0026:        import weka.core.converters.ConverterUtils.DataSource;
0027:
0028:        import java.io.FileReader;
0029:        import java.io.IOException;
0030:        import java.io.Reader;
0031:        import java.io.Serializable;
0032:        import java.util.Enumeration;
0033:        import java.util.Random;
0034:
0035:        /**
0036:         * Class for handling an ordered set of weighted instances. <p>
0037:         *
0038:         * Typical usage: <p>
0039:         * <pre>
0040:         * import weka.core.converters.ConverterUtils.DataSource;
0041:         * ...
0042:         * 
0043:         * // Read all the instances in the file (ARFF, CSV, XRFF, ...)
0044:         * DataSource source = new DataSource(filename);
0045:         * Instances instances = source.getDataSet();
0046:         *
0047:         * // Make the last attribute be the class
0048:         * instances.setClassIndex(instances.numAttributes() - 1);
0049:         * 
0050:         * // Print header and instances.
0051:         * System.out.println("\nDataset:\n");
0052:         * System.out.println(instances);
0053:         * 
0054:         * ...
0055:         * </pre><p>
0056:         *
0057:         * All methods that change a set of instances are safe, i.e. a change
0058:         * of a set of instances does not affect any other sets of
0059:         * instances. All methods that change a dataset's attribute
0060:         * information clone the dataset before it is changed.
0061:         *
0062:         * @author Eibe Frank (eibe@cs.waikato.ac.nz)
0063:         * @author Len Trigg (trigg@cs.waikato.ac.nz)
0064:         * @author FracPete (fracpete at waikato dot ac dot nz)
0065:         * @version $Revision: 1.72 $ 
0066:         */
0067:        public class Instances implements  Serializable {
0068:
0069:            /** for serialization */
0070:            static final long serialVersionUID = -19412345060742748L;
0071:
0072:            /** The filename extension that should be used for arff files */
0073:            public final static String FILE_EXTENSION = ".arff";
0074:
0075:            /** The filename extension that should be used for bin. serialized instances files */
0076:            public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi";
0077:
0078:            /** The keyword used to denote the start of an arff header */
0079:            public final static String ARFF_RELATION = "@relation";
0080:
0081:            /** The keyword used to denote the start of the arff data section */
0082:            public final static String ARFF_DATA = "@data";
0083:
0084:            /** The dataset's name. */
0085:            protected/*@spec_public non_null@*/String m_RelationName;
0086:
0087:            /** The attribute information. */
0088:            protected/*@spec_public non_null@*/FastVector m_Attributes;
0089:            /*  public invariant (\forall int i; 0 <= i && i < m_Attributes.size(); 
0090:                              m_Attributes.elementAt(i) != null);
0091:             */
0092:
0093:            /** The instances. */
0094:            protected/*@spec_public non_null@*/FastVector m_Instances;
0095:
0096:            /** The class attribute's index */
0097:            protected int m_ClassIndex;
0098:            //@ protected invariant classIndex() == m_ClassIndex;
0099:
0100:            /** The lines read so far in case of incremental loading. Since the 
0101:             * StreamTokenizer will be re-initialized with every instance that is read,
0102:             * we have to keep track of the number of lines read so far. 
0103:             * @see #readInstance(Reader) */
0104:            protected int m_Lines = 0;
0105:
0106:            /**
0107:             * Reads an ARFF file from a reader, and assigns a weight of
0108:             * one to each instance. Lets the index of the class 
0109:             * attribute be undefined (negative).
0110:             *
0111:             * @param reader the reader
0112:             * @throws IOException if the ARFF file is not read 
0113:             * successfully
0114:             */
0115:            public Instances(/*@non_null@*/Reader reader) throws IOException {
0116:                ArffReader arff = new ArffReader(reader);
0117:                Instances dataset = arff.getData();
0118:                initialize(dataset, dataset.numInstances());
0119:                dataset.copyInstances(0, this , dataset.numInstances());
0120:                compactify();
0121:            }
0122:
0123:            /**
0124:             * Reads the header of an ARFF file from a reader and 
0125:             * reserves space for the given number of instances. Lets
0126:             * the class index be undefined (negative).
0127:             *
0128:             * @param reader the reader
0129:             * @param capacity the capacity
0130:             * @throws IllegalArgumentException if the header is not read successfully
0131:             * or the capacity is negative.
0132:             * @throws IOException if there is a problem with the reader.
0133:             * @deprecated instead of using this method in conjunction with the
0134:             * <code>readInstance(Reader)</code> method, one should use the 
0135:             * <code>ArffLoader</code> or <code>DataSource</code> class instead.
0136:             * @see weka.core.converters.ArffLoader
0137:             * @see weka.core.converters.ConverterUtils.DataSource
0138:             */
0139:            //@ requires capacity >= 0;
0140:            //@ ensures classIndex() == -1;
0141:            @Deprecated
0142:            public Instances(/*@non_null@*/Reader reader, int capacity)
0143:                    throws IOException {
0144:
0145:                ArffReader arff = new ArffReader(reader, 0);
0146:                Instances header = arff.getStructure();
0147:                initialize(header, capacity);
0148:                m_Lines = arff.getLineNo();
0149:            }
0150:
0151:            /**
0152:             * Constructor copying all instances and references to
0153:             * the header information from the given set of instances.
0154:             *
0155:             * @param dataset the set to be copied
0156:             */
0157:            public Instances(/*@non_null@*/Instances dataset) {
0158:
0159:                this (dataset, dataset.numInstances());
0160:
0161:                dataset.copyInstances(0, this , dataset.numInstances());
0162:            }
0163:
/**
 * Creates an empty set of instances that shares (by reference) the
 * header information of the given dataset. A negative capacity is
 * replaced by 0.
 *
 * @param dataset the instances whose header information is used
 * @param capacity the initial capacity of the new dataset
 */
public Instances(/*@non_null@*/ Instances dataset, int capacity) {
    // All the work, including the capacity clamp, happens in initialize().
    initialize(dataset, capacity);
}
0176:
0177:            /**
0178:             * initializes with the header information of the given dataset and sets
0179:             * the capacity of the set of instances.
0180:             * 
0181:             * @param dataset the dataset to use as template
0182:             * @param capacity the number of rows to reserve
0183:             */
0184:            protected void initialize(Instances dataset, int capacity) {
0185:                if (capacity < 0)
0186:                    capacity = 0;
0187:
0188:                // Strings only have to be "shallow" copied because
0189:                // they can't be modified.
0190:                m_ClassIndex = dataset.m_ClassIndex;
0191:                m_RelationName = dataset.m_RelationName;
0192:                m_Attributes = dataset.m_Attributes;
0193:                m_Instances = new FastVector(capacity);
0194:            }
0195:
/**
 * Creates a new set of instances by copying a contiguous subset of
 * another set. The header information is shared with the source.
 *
 * @param source the set of instances from which a subset
 * is to be created
 * @param first the index of the first instance to be copied
 * @param toCopy the number of instances to be copied
 * @throws IllegalArgumentException if first is negative, toCopy is
 * negative, or the range first .. first + toCopy - 1 is not fully
 * inside the source
 */
//@ requires 0 <= first;
//@ requires 0 <= toCopy;
//@ requires first + toCopy <= source.numInstances();
public Instances(/*@non_null@*/ Instances source, int first, int toCopy) {

    this(source, toCopy);

    // Compare against the remaining room instead of computing
    // first + toCopy, which can overflow int and slip past the check.
    // A negative toCopy violates the stated precondition and is rejected.
    if ((first < 0) || (toCopy < 0)
            || (toCopy > source.numInstances() - first)) {
        throw new IllegalArgumentException(
                "Parameters first and/or toCopy out " + "of range");
    }
    source.copyInstances(first, this, toCopy);
}
0220:
/**
 * Creates an empty set of instances that uses the given attribute
 * information directly (the vector is not copied). Every attribute in
 * the vector has its index set to its position in the vector, and the
 * class index starts out unset (-1). A negative capacity is replaced
 * by 0, matching {@link #initialize(Instances, int)}. The given
 * attribute information must not be changed after this constructor
 * has been used.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set; negative values are
 *                 treated as 0
 */
public Instances(/*@non_null@*/ String name,
        /*@non_null@*/ FastVector attInfo, int capacity) {

    m_RelationName = name;
    m_ClassIndex = -1;
    m_Attributes = attInfo;
    for (int i = 0; i < numAttributes(); i++) {
        attribute(i).setIndex(i);
    }
    // Clamp as documented; previously a negative capacity was handed
    // straight to the FastVector constructor.
    m_Instances = new FastVector(Math.max(capacity, 0));
}
0242:
0243:            /**
0244:             * Create a copy of the structure, but "cleanse" string types (i.e.
0245:             * doesn't contain references to the strings seen in the past).
0246:             * Also cleanses all relational attributes.
0247:             *
0248:             * @return a copy of the instance structure.
0249:             */
0250:            public Instances stringFreeStructure() {
0251:
0252:                FastVector atts = (FastVector) m_Attributes.copy();
0253:                for (int i = 0; i < atts.size(); i++) {
0254:                    Attribute att = (Attribute) atts.elementAt(i);
0255:                    if (att.type() == Attribute.STRING) {
0256:                        atts.setElementAt(new Attribute(att.name(),
0257:                                (FastVector) null), i);
0258:                    } else if (att.type() == Attribute.RELATIONAL) {
0259:                        atts.setElementAt(new Attribute(att.name(),
0260:                                new Instances(att.relation(), 0)), i);
0261:                    }
0262:                }
0263:                Instances result = new Instances(relationName(), atts, 0);
0264:                result.m_ClassIndex = m_ClassIndex;
0265:                return result;
0266:            }
0267:
0268:            /**
0269:             * Adds one instance to the end of the set. 
0270:             * Shallow copies instance before it is added. Increases the
0271:             * size of the dataset if it is not large enough. Does not
0272:             * check if the instance is compatible with the dataset.
0273:             * Note: String or relational values are not transferred.
0274:             *
0275:             * @param instance the instance to be added
0276:             */
0277:            public void add(/*@non_null@*/Instance instance) {
0278:
0279:                Instance newInstance = (Instance) instance.copy();
0280:
0281:                newInstance.setDataset(this );
0282:                m_Instances.addElement(newInstance);
0283:            }
0284:
0285:            /**
0286:             * Returns an attribute.
0287:             *
0288:             * @param index the attribute's index (index starts with 0)
0289:             * @return the attribute at the given position
0290:             */
0291:            //@ requires 0 <= index;
0292:            //@ requires index < m_Attributes.size();
0293:            //@ ensures \result != null;
0294:            public/*@pure@*/Attribute attribute(int index) {
0295:
0296:                return (Attribute) m_Attributes.elementAt(index);
0297:            }
0298:
0299:            /**
0300:             * Returns an attribute given its name. If there is more than
0301:             * one attribute with the same name, it returns the first one.
0302:             * Returns null if the attribute can't be found.
0303:             *
0304:             * @param name the attribute's name
0305:             * @return the attribute with the given name, null if the
0306:             * attribute can't be found
0307:             */
0308:            public/*@pure@*/Attribute attribute(String name) {
0309:
0310:                for (int i = 0; i < numAttributes(); i++) {
0311:                    if (attribute(i).name().equals(name)) {
0312:                        return attribute(i);
0313:                    }
0314:                }
0315:                return null;
0316:            }
0317:
0318:            /**
0319:             * Checks for attributes of the given type in the dataset
0320:             *
0321:             * @param attType  the attribute type to look for
0322:             * @return         true if attributes of the given type are present
0323:             */
0324:            public boolean checkForAttributeType(int attType) {
0325:
0326:                int i = 0;
0327:
0328:                while (i < m_Attributes.size()) {
0329:                    if (attribute(i++).type() == attType) {
0330:                        return true;
0331:                    }
0332:                }
0333:                return false;
0334:            }
0335:
0336:            /**
0337:             * Checks for string attributes in the dataset
0338:             *
0339:             * @return true if string attributes are present, false otherwise
0340:             */
0341:            public/*@pure@*/boolean checkForStringAttributes() {
0342:                return checkForAttributeType(Attribute.STRING);
0343:            }
0344:
0345:            /**
0346:             * Checks if the given instance is compatible
0347:             * with this dataset. Only looks at the size of
0348:             * the instance and the ranges of the values for 
0349:             * nominal and string attributes.
0350:             *
0351:             * @param instance the instance to check
0352:             * @return true if the instance is compatible with the dataset 
0353:             */
0354:            public/*@pure@*/boolean checkInstance(Instance instance) {
0355:
0356:                if (instance.numAttributes() != numAttributes()) {
0357:                    return false;
0358:                }
0359:                for (int i = 0; i < numAttributes(); i++) {
0360:                    if (instance.isMissing(i)) {
0361:                        continue;
0362:                    } else if (attribute(i).isNominal()
0363:                            || attribute(i).isString()) {
0364:                        if (!(Utils.eq(instance.value(i),
0365:                                (double) (int) instance.value(i)))) {
0366:                            return false;
0367:                        } else if (Utils.sm(instance.value(i), 0)
0368:                                || Utils.gr(instance.value(i), attribute(i)
0369:                                        .numValues())) {
0370:                            return false;
0371:                        }
0372:                    }
0373:                }
0374:                return true;
0375:            }
0376:
0377:            /**
0378:             * Returns the class attribute.
0379:             *
0380:             * @return the class attribute
0381:             * @throws UnassignedClassException if the class is not set
0382:             */
0383:            //@ requires classIndex() >= 0;
0384:            public/*@pure@*/Attribute classAttribute() {
0385:
0386:                if (m_ClassIndex < 0) {
0387:                    throw new UnassignedClassException(
0388:                            "Class index is negative (not set)!");
0389:                }
0390:                return attribute(m_ClassIndex);
0391:            }
0392:
0393:            /**
0394:             * Returns the class attribute's index. Returns negative number
0395:             * if it's undefined.
0396:             *
0397:             * @return the class index as an integer
0398:             */
0399:            // ensures \result == m_ClassIndex;
0400:            public/*@pure@*/int classIndex() {
0401:
0402:                return m_ClassIndex;
0403:            }
0404:
0405:            /**
0406:             * Compactifies the set of instances. Decreases the capacity of
0407:             * the set so that it matches the number of instances in the set.
0408:             */
0409:            public void compactify() {
0410:
0411:                m_Instances.trimToSize();
0412:            }
0413:
0414:            /**
0415:             * Removes all instances from the set.
0416:             */
0417:            public void delete() {
0418:
0419:                m_Instances = new FastVector();
0420:            }
0421:
0422:            /**
0423:             * Removes an instance at the given position from the set.
0424:             *
0425:             * @param index the instance's position (index starts with 0)
0426:             */
0427:            //@ requires 0 <= index && index < numInstances();
0428:            public void delete(int index) {
0429:
0430:                m_Instances.removeElementAt(index);
0431:            }
0432:
0433:            /**
0434:             * Deletes an attribute at the given position 
0435:             * (0 to numAttributes() - 1). A deep copy of the attribute
0436:             * information is performed before the attribute is deleted.
0437:             *
0438:             * @param position the attribute's position (position starts with 0)
0439:             * @throws IllegalArgumentException if the given index is out of range 
0440:             *            or the class attribute is being deleted
0441:             */
0442:            //@ requires 0 <= position && position < numAttributes();
0443:            //@ requires position != classIndex();
0444:            public void deleteAttributeAt(int position) {
0445:
0446:                if ((position < 0) || (position >= m_Attributes.size())) {
0447:                    throw new IllegalArgumentException("Index out of range");
0448:                }
0449:                if (position == m_ClassIndex) {
0450:                    throw new IllegalArgumentException(
0451:                            "Can't delete class attribute");
0452:                }
0453:                freshAttributeInfo();
0454:                if (m_ClassIndex > position) {
0455:                    m_ClassIndex--;
0456:                }
0457:                m_Attributes.removeElementAt(position);
0458:                for (int i = position; i < m_Attributes.size(); i++) {
0459:                    Attribute current = (Attribute) m_Attributes.elementAt(i);
0460:                    current.setIndex(current.index() - 1);
0461:                }
0462:                for (int i = 0; i < numInstances(); i++) {
0463:                    instance(i).forceDeleteAttributeAt(position);
0464:                }
0465:            }
0466:
0467:            /**
0468:             * Deletes all attributes of the given type in the dataset. A deep copy of 
0469:             * the attribute information is performed before an attribute is deleted.
0470:             *
0471:             * @param attType the attribute type to delete
0472:             * @throws IllegalArgumentException if attribute couldn't be 
0473:             * successfully deleted (probably because it is the class attribute).
0474:             */
0475:            public void deleteAttributeType(int attType) {
0476:                int i = 0;
0477:                while (i < m_Attributes.size()) {
0478:                    if (attribute(i).type() == attType) {
0479:                        deleteAttributeAt(i);
0480:                    } else {
0481:                        i++;
0482:                    }
0483:                }
0484:            }
0485:
0486:            /**
0487:             * Deletes all string attributes in the dataset. A deep copy of the attribute
0488:             * information is performed before an attribute is deleted.
0489:             *
0490:             * @throws IllegalArgumentException if string attribute couldn't be 
0491:             * successfully deleted (probably because it is the class attribute).
0492:             * @see #deleteAttributeType(int)
0493:             */
0494:            public void deleteStringAttributes() {
0495:                deleteAttributeType(Attribute.STRING);
0496:            }
0497:
0498:            /**
0499:             * Removes all instances with missing values for a particular
0500:             * attribute from the dataset.
0501:             *
0502:             * @param attIndex the attribute's index (index starts with 0)
0503:             */
0504:            //@ requires 0 <= attIndex && attIndex < numAttributes();
0505:            public void deleteWithMissing(int attIndex) {
0506:
0507:                FastVector newInstances = new FastVector(numInstances());
0508:
0509:                for (int i = 0; i < numInstances(); i++) {
0510:                    if (!instance(i).isMissing(attIndex)) {
0511:                        newInstances.addElement(instance(i));
0512:                    }
0513:                }
0514:                m_Instances = newInstances;
0515:            }
0516:
0517:            /**
0518:             * Removes all instances with missing values for a particular
0519:             * attribute from the dataset.
0520:             *
0521:             * @param att the attribute
0522:             */
0523:            public void deleteWithMissing(/*@non_null@*/Attribute att) {
0524:
0525:                deleteWithMissing(att.index());
0526:            }
0527:
0528:            /**
0529:             * Removes all instances with a missing class value
0530:             * from the dataset.
0531:             *
0532:             * @throws UnassignedClassException if class is not set
0533:             */
0534:            public void deleteWithMissingClass() {
0535:
0536:                if (m_ClassIndex < 0) {
0537:                    throw new UnassignedClassException(
0538:                            "Class index is negative (not set)!");
0539:                }
0540:                deleteWithMissing(m_ClassIndex);
0541:            }
0542:
0543:            /**
0544:             * Returns an enumeration of all the attributes.
0545:             *
0546:             * @return enumeration of all the attributes.
0547:             */
0548:            public/*@non_null pure@*/Enumeration enumerateAttributes() {
0549:
0550:                return m_Attributes.elements(m_ClassIndex);
0551:            }
0552:
0553:            /**
0554:             * Returns an enumeration of all instances in the dataset.
0555:             *
0556:             * @return enumeration of all instances in the dataset
0557:             */
0558:            public/*@non_null pure@*/Enumeration enumerateInstances() {
0559:
0560:                return m_Instances.elements();
0561:            }
0562:
0563:            /**
0564:             * Checks if two headers are equivalent.
0565:             *
0566:             * @param dataset another dataset
0567:             * @return true if the header of the given dataset is equivalent 
0568:             * to this header
0569:             */
0570:            public/*@pure@*/boolean equalHeaders(Instances dataset) {
0571:
0572:                // Check class and all attributes
0573:                if (m_ClassIndex != dataset.m_ClassIndex) {
0574:                    return false;
0575:                }
0576:                if (m_Attributes.size() != dataset.m_Attributes.size()) {
0577:                    return false;
0578:                }
0579:                for (int i = 0; i < m_Attributes.size(); i++) {
0580:                    if (!(attribute(i).equals(dataset.attribute(i)))) {
0581:                        return false;
0582:                    }
0583:                }
0584:                return true;
0585:            }
0586:
0587:            /**
0588:             * Returns the first instance in the set.
0589:             *
0590:             * @return the first instance in the set
0591:             */
0592:            //@ requires numInstances() > 0;
0593:            public/*@non_null pure@*/Instance firstInstance() {
0594:
0595:                return (Instance) m_Instances.firstElement();
0596:            }
0597:
0598:            /**
0599:             * Returns a random number generator. The initial seed of the random
0600:             * number generator depends on the given seed and the hash code of
0601:             * a string representation of a instances chosen based on the given
0602:             * seed. 
0603:             *
0604:             * @param seed the given seed
0605:             * @return the random number generator
0606:             */
0607:            public Random getRandomNumberGenerator(long seed) {
0608:
0609:                Random r = new Random(seed);
0610:                r.setSeed(instance(r.nextInt(numInstances())).toString()
0611:                        .hashCode()
0612:                        + seed);
0613:                return r;
0614:            }
0615:
0616:            /**
0617:             * Inserts an attribute at the given position (0 to 
0618:             * numAttributes()) and sets all values to be missing.
0619:             * Shallow copies the attribute before it is inserted, and performs
0620:             * a deep copy of the existing attribute information.
0621:             *
0622:             * @param att the attribute to be inserted
0623:             * @param position the attribute's position (position starts with 0)
0624:             * @throws IllegalArgumentException if the given index is out of range
0625:             */
0626:            //@ requires 0 <= position;
0627:            //@ requires position <= numAttributes();
0628:            public void insertAttributeAt(/*@non_null@*/Attribute att,
0629:                    int position) {
0630:
0631:                if ((position < 0) || (position > m_Attributes.size())) {
0632:                    throw new IllegalArgumentException("Index out of range");
0633:                }
0634:                att = (Attribute) att.copy();
0635:                freshAttributeInfo();
0636:                att.setIndex(position);
0637:                m_Attributes.insertElementAt(att, position);
0638:                for (int i = position + 1; i < m_Attributes.size(); i++) {
0639:                    Attribute current = (Attribute) m_Attributes.elementAt(i);
0640:                    current.setIndex(current.index() + 1);
0641:                }
0642:                for (int i = 0; i < numInstances(); i++) {
0643:                    instance(i).forceInsertAttributeAt(position);
0644:                }
0645:                if (m_ClassIndex >= position) {
0646:                    m_ClassIndex++;
0647:                }
0648:            }
0649:
0650:            /**
0651:             * Returns the instance at the given position.
0652:             *
0653:             * @param index the instance's index (index starts with 0)
0654:             * @return the instance at the given position
0655:             */
0656:            //@ requires 0 <= index;
0657:            //@ requires index < numInstances();
0658:            public/*@non_null pure@*/Instance instance(int index) {
0659:
0660:                return (Instance) m_Instances.elementAt(index);
0661:            }
0662:
0663:            /**
0664:             * Returns the kth-smallest attribute value of a numeric attribute.
0665:             * Note that calling this method will change the order of the data!
0666:             *
0667:             * @param att the Attribute object
0668:             * @param k the value of k
0669:             * @return the kth-smallest value
0670:             */
0671:            public double kthSmallestValue(Attribute att, int k) {
0672:
0673:                return kthSmallestValue(att.index(), k);
0674:            }
0675:
0676:            /**
0677:             * Returns the kth-smallest attribute value of a numeric attribute.
0678:             * Note that calling this method will change the order of the data!
0679:             * The number of non-missing values in the data must be as least
0680:             * as last as k for this to work.
0681:             *
0682:             * @param attIndex the attribute's index
0683:             * @param k the value of k
0684:             * @return the kth-smallest value
0685:             */
0686:            public double kthSmallestValue(int attIndex, int k) {
0687:
0688:                if (!attribute(attIndex).isNumeric()) {
0689:                    throw new IllegalArgumentException(
0690:                            "Instances: attribute must be numeric to compute kth-smallest value.");
0691:                }
0692:
0693:                int i, j;
0694:
0695:                // move all instances with missing values to end
0696:                j = numInstances() - 1;
0697:                i = 0;
0698:                while (i <= j) {
0699:                    if (instance(j).isMissing(attIndex)) {
0700:                        j--;
0701:                    } else {
0702:                        if (instance(i).isMissing(attIndex)) {
0703:                            swap(i, j);
0704:                            j--;
0705:                        }
0706:                        i++;
0707:                    }
0708:                }
0709:
0710:                if ((k < 0) || (k > j)) {
0711:                    throw new IllegalArgumentException(
0712:                            "Instances: value for k for computing kth-smallest value too large.");
0713:                }
0714:
0715:                return instance(select(attIndex, 0, j, k)).value(attIndex);
0716:            }
0717:
0718:            /**
0719:             * Returns the last instance in the set.
0720:             *
0721:             * @return the last instance in the set
0722:             */
0723:            //@ requires numInstances() > 0;
0724:            public/*@non_null pure@*/Instance lastInstance() {
0725:
0726:                return (Instance) m_Instances.lastElement();
0727:            }
0728:
0729:            /**
0730:             * Returns the mean (mode) for a numeric (nominal) attribute as
0731:             * a floating-point value. Returns 0 if the attribute is neither nominal nor 
0732:             * numeric. If all values are missing it returns zero.
0733:             *
0734:             * @param attIndex the attribute's index (index starts with 0)
0735:             * @return the mean or the mode
0736:             */
0737:            public/*@pure@*/double meanOrMode(int attIndex) {
0738:
0739:                double result, found;
0740:                int[] counts;
0741:
0742:                if (attribute(attIndex).isNumeric()) {
0743:                    result = found = 0;
0744:                    for (int j = 0; j < numInstances(); j++) {
0745:                        if (!instance(j).isMissing(attIndex)) {
0746:                            found += instance(j).weight();
0747:                            result += instance(j).weight()
0748:                                    * instance(j).value(attIndex);
0749:                        }
0750:                    }
0751:                    if (found <= 0) {
0752:                        return 0;
0753:                    } else {
0754:                        return result / found;
0755:                    }
0756:                } else if (attribute(attIndex).isNominal()) {
0757:                    counts = new int[attribute(attIndex).numValues()];
0758:                    for (int j = 0; j < numInstances(); j++) {
0759:                        if (!instance(j).isMissing(attIndex)) {
0760:                            counts[(int) instance(j).value(attIndex)] += instance(
0761:                                    j).weight();
0762:                        }
0763:                    }
0764:                    return (double) Utils.maxIndex(counts);
0765:                } else {
0766:                    return 0;
0767:                }
0768:            }
0769:
0770:            /**
0771:             * Returns the mean (mode) for a numeric (nominal) attribute as a
0772:             * floating-point value.  Returns 0 if the attribute is neither
0773:             * nominal nor numeric.  If all values are missing it returns zero.
0774:             *
0775:             * @param att the attribute
0776:             * @return the mean or the mode 
0777:             */
0778:            public/*@pure@*/double meanOrMode(Attribute att) {
0779:
0780:                return meanOrMode(att.index());
0781:            }
0782:
0783:            /**
0784:             * Returns the number of attributes.
0785:             *
0786:             * @return the number of attributes as an integer
0787:             */
0788:            //@ ensures \result == m_Attributes.size();
0789:            public/*@pure@*/int numAttributes() {
0790:
0791:                return m_Attributes.size();
0792:            }
0793:
0794:            /**
0795:             * Returns the number of class labels.
0796:             *
0797:             * @return the number of class labels as an integer if the class 
0798:             * attribute is nominal, 1 otherwise.
0799:             * @throws UnassignedClassException if the class is not set
0800:             */
0801:            //@ requires classIndex() >= 0;
0802:            public/*@pure@*/int numClasses() {
0803:
0804:                if (m_ClassIndex < 0) {
0805:                    throw new UnassignedClassException(
0806:                            "Class index is negative (not set)!");
0807:                }
0808:                if (!classAttribute().isNominal()) {
0809:                    return 1;
0810:                } else {
0811:                    return classAttribute().numValues();
0812:                }
0813:            }
0814:
0815:            /**
0816:             * Returns the number of distinct values of a given attribute.
0817:             * Returns the number of instances if the attribute is a
0818:             * string attribute. The value 'missing' is not counted.
0819:             *
0820:             * @param attIndex the attribute (index starts with 0)
0821:             * @return the number of distinct values of a given attribute
0822:             */
0823:            //@ requires 0 <= attIndex;
0824:            //@ requires attIndex < numAttributes();
0825:            public/*@pure@*/int numDistinctValues(int attIndex) {
0826:
0827:                if (attribute(attIndex).isNumeric()) {
0828:                    double[] attVals = attributeToDoubleArray(attIndex);
0829:                    int[] sorted = Utils.sort(attVals);
0830:                    double prev = 0;
0831:                    int counter = 0;
0832:                    for (int i = 0; i < sorted.length; i++) {
0833:                        Instance current = instance(sorted[i]);
0834:                        if (current.isMissing(attIndex)) {
0835:                            break;
0836:                        }
0837:                        if ((i == 0) || (current.value(attIndex) > prev)) {
0838:                            prev = current.value(attIndex);
0839:                            counter++;
0840:                        }
0841:                    }
0842:                    return counter;
0843:                } else {
0844:                    return attribute(attIndex).numValues();
0845:                }
0846:            }
0847:
0848:            /**
0849:             * Returns the number of distinct values of a given attribute.
0850:             * Returns the number of instances if the attribute is a
0851:             * string attribute. The value 'missing' is not counted.
0852:             *
0853:             * @param att the attribute
0854:             * @return the number of distinct values of a given attribute
0855:             */
0856:            public/*@pure@*/int numDistinctValues(/*@non_null@*/Attribute att) {
0857:
0858:                return numDistinctValues(att.index());
0859:            }
0860:
0861:            /**
0862:             * Returns the number of instances in the dataset.
0863:             *
0864:             * @return the number of instances in the dataset as an integer
0865:             */
0866:            //@ ensures \result == m_Instances.size();
0867:            public/*@pure@*/int numInstances() {
0868:
0869:                return m_Instances.size();
0870:            }
0871:
0872:            /**
0873:             * Shuffles the instances in the set so that they are ordered 
0874:             * randomly.
0875:             *
0876:             * @param random a random number generator
0877:             */
0878:            public void randomize(Random random) {
0879:
0880:                for (int j = numInstances() - 1; j > 0; j--)
0881:                    swap(j, random.nextInt(j + 1));
0882:            }
0883:
0884:            /**
0885:             * Reads a single instance from the reader and appends it
0886:             * to the dataset.  Automatically expands the dataset if it
0887:             * is not large enough to hold the instance. This method does
0888:             * not check for carriage return at the end of the line.
0889:             *
0890:             * @param reader the reader 
0891:             * @return false if end of file has been reached
0892:             * @throws IOException if the information is not read 
0893:             * successfully
0894:             * @deprecated instead of using this method in conjunction with the
0895:             * <code>readInstance(Reader)</code> method, one should use the 
0896:             * <code>ArffLoader</code> or <code>DataSource</code> class instead.
0897:             * @see weka.core.converters.ArffLoader
0898:             * @see weka.core.converters.ConverterUtils.DataSource
0899:             */
0900:            @Deprecated
0901:            public boolean readInstance(Reader reader) throws IOException {
0902:
0903:                ArffReader arff = new ArffReader(reader, this , m_Lines, 1);
0904:                Instance inst = arff.readInstance(arff.getData(), false);
0905:                m_Lines = arff.getLineNo();
0906:                if (inst != null) {
0907:                    add(inst);
0908:                    return true;
0909:                } else {
0910:                    return false;
0911:                }
0912:            }
0913:
0914:            /**
0915:             * Returns the relation's name.
0916:             *
0917:             * @return the relation's name as a string
0918:             */
0919:            //@ ensures \result == m_RelationName;
0920:            public/*@pure@*/String relationName() {
0921:
0922:                return m_RelationName;
0923:            }
0924:
0925:            /**
0926:             * Renames an attribute. This change only affects this
0927:             * dataset.
0928:             *
0929:             * @param att the attribute's index (index starts with 0)
0930:             * @param name the new name
0931:             */
0932:            public void renameAttribute(int att, String name) {
0933:
0934:                Attribute newAtt = attribute(att).copy(name);
0935:                FastVector newVec = new FastVector(numAttributes());
0936:
0937:                for (int i = 0; i < numAttributes(); i++) {
0938:                    if (i == att) {
0939:                        newVec.addElement(newAtt);
0940:                    } else {
0941:                        newVec.addElement(attribute(i));
0942:                    }
0943:                }
0944:                m_Attributes = newVec;
0945:            }
0946:
0947:            /**
0948:             * Renames an attribute. This change only affects this
0949:             * dataset.
0950:             *
0951:             * @param att the attribute
0952:             * @param name the new name
0953:             */
0954:            public void renameAttribute(Attribute att, String name) {
0955:
0956:                renameAttribute(att.index(), name);
0957:            }
0958:
0959:            /**
0960:             * Renames the value of a nominal (or string) attribute value. This
0961:             * change only affects this dataset.
0962:             *
0963:             * @param att the attribute's index (index starts with 0)
0964:             * @param val the value's index (index starts with 0)
0965:             * @param name the new name 
0966:             */
0967:            public void renameAttributeValue(int att, int val, String name) {
0968:
0969:                Attribute newAtt = (Attribute) attribute(att).copy();
0970:                FastVector newVec = new FastVector(numAttributes());
0971:
0972:                newAtt.setValue(val, name);
0973:                for (int i = 0; i < numAttributes(); i++) {
0974:                    if (i == att) {
0975:                        newVec.addElement(newAtt);
0976:                    } else {
0977:                        newVec.addElement(attribute(i));
0978:                    }
0979:                }
0980:                m_Attributes = newVec;
0981:            }
0982:
0983:            /**
0984:             * Renames the value of a nominal (or string) attribute value. This
0985:             * change only affects this dataset.
0986:             *
0987:             * @param att the attribute
0988:             * @param val the value
0989:             * @param name the new name
0990:             */
0991:            public void renameAttributeValue(Attribute att, String val,
0992:                    String name) {
0993:
0994:                int v = att.indexOfValue(val);
0995:                if (v == -1)
0996:                    throw new IllegalArgumentException(val + " not found");
0997:                renameAttributeValue(att.index(), v, name);
0998:            }
0999:
1000:            /**
1001:             * Creates a new dataset of the same size using random sampling
1002:             * with replacement.
1003:             *
1004:             * @param random a random number generator
1005:             * @return the new dataset
1006:             */
1007:            public Instances resample(Random random) {
1008:
1009:                Instances newData = new Instances(this , numInstances());
1010:                while (newData.numInstances() < numInstances()) {
1011:                    newData.add(instance(random.nextInt(numInstances())));
1012:                }
1013:                return newData;
1014:            }
1015:
1016:            /**
1017:             * Creates a new dataset of the same size using random sampling
1018:             * with replacement according to the current instance weights. The
1019:             * weights of the instances in the new dataset are set to one.
1020:             *
1021:             * @param random a random number generator
1022:             * @return the new dataset
1023:             */
1024:            public Instances resampleWithWeights(Random random) {
1025:
1026:                double[] weights = new double[numInstances()];
1027:                for (int i = 0; i < weights.length; i++) {
1028:                    weights[i] = instance(i).weight();
1029:                }
1030:                return resampleWithWeights(random, weights);
1031:            }
1032:
1033:            /**
1034:             * Creates a new dataset of the same size using random sampling
1035:             * with replacement according to the given weight vector. The
1036:             * weights of the instances in the new dataset are set to one.
1037:             * The length of the weight vector has to be the same as the
1038:             * number of instances in the dataset, and all weights have to
1039:             * be positive.
1040:             *
1041:             * @param random a random number generator
1042:             * @param weights the weight vector
1043:             * @return the new dataset
1044:             * @throws IllegalArgumentException if the weights array is of the wrong
1045:             * length or contains negative weights.
1046:             */
1047:            public Instances resampleWithWeights(Random random, double[] weights) {
1048:
1049:                if (weights.length != numInstances()) {
1050:                    throw new IllegalArgumentException(
1051:                            "weights.length != numInstances.");
1052:                }
1053:                Instances newData = new Instances(this , numInstances());
1054:                if (numInstances() == 0) {
1055:                    return newData;
1056:                }
1057:                double[] probabilities = new double[numInstances()];
1058:                double sumProbs = 0, sumOfWeights = Utils.sum(weights);
1059:                for (int i = 0; i < numInstances(); i++) {
1060:                    sumProbs += random.nextDouble();
1061:                    probabilities[i] = sumProbs;
1062:                }
1063:                Utils.normalize(probabilities, sumProbs / sumOfWeights);
1064:
1065:                // Make sure that rounding errors don't mess things up
1066:                probabilities[numInstances() - 1] = sumOfWeights;
1067:                int k = 0;
1068:                int l = 0;
1069:                sumProbs = 0;
1070:                while ((k < numInstances() && (l < numInstances()))) {
1071:                    if (weights[l] < 0) {
1072:                        throw new IllegalArgumentException(
1073:                                "Weights have to be positive.");
1074:                    }
1075:                    sumProbs += weights[l];
1076:                    while ((k < numInstances())
1077:                            && (probabilities[k] <= sumProbs)) {
1078:                        newData.add(instance(l));
1079:                        newData.instance(k).setWeight(1);
1080:                        k++;
1081:                    }
1082:                    l++;
1083:                }
1084:                return newData;
1085:            }
1086:
1087:            /** 
1088:             * Sets the class attribute.
1089:             *
1090:             * @param att attribute to be the class
1091:             */
1092:            public void setClass(Attribute att) {
1093:
1094:                m_ClassIndex = att.index();
1095:            }
1096:
1097:            /** 
1098:             * Sets the class index of the set.
1099:             * If the class index is negative there is assumed to be no class.
1100:             * (ie. it is undefined)
1101:             *
1102:             * @param classIndex the new class index (index starts with 0)
1103:             * @throws IllegalArgumentException if the class index is too big or < 0
1104:             */
1105:            public void setClassIndex(int classIndex) {
1106:
1107:                if (classIndex >= numAttributes()) {
1108:                    throw new IllegalArgumentException("Invalid class index: "
1109:                            + classIndex);
1110:                }
1111:                m_ClassIndex = classIndex;
1112:            }
1113:
1114:            /**
1115:             * Sets the relation's name.
1116:             *
1117:             * @param newName the new relation name.
1118:             */
1119:            public void setRelationName(/*@non_null@*/String newName) {
1120:
1121:                m_RelationName = newName;
1122:            }
1123:
1124:            /**
1125:             * Sorts the instances based on an attribute. For numeric attributes, 
1126:             * instances are sorted in ascending order. For nominal attributes, 
1127:             * instances are sorted based on the attribute label ordering 
1128:             * specified in the header. Instances with missing values for the 
1129:             * attribute are placed at the end of the dataset.
1130:             *
1131:             * @param attIndex the attribute's index (index starts with 0)
1132:             */
1133:            public void sort(int attIndex) {
1134:
1135:                int i, j;
1136:
1137:                // move all instances with missing values to end
1138:                j = numInstances() - 1;
1139:                i = 0;
1140:                while (i <= j) {
1141:                    if (instance(j).isMissing(attIndex)) {
1142:                        j--;
1143:                    } else {
1144:                        if (instance(i).isMissing(attIndex)) {
1145:                            swap(i, j);
1146:                            j--;
1147:                        }
1148:                        i++;
1149:                    }
1150:                }
1151:                quickSort(attIndex, 0, j);
1152:            }
1153:
1154:            /**
1155:             * Sorts the instances based on an attribute. For numeric attributes, 
1156:             * instances are sorted into ascending order. For nominal attributes, 
1157:             * instances are sorted based on the attribute label ordering 
1158:             * specified in the header. Instances with missing values for the 
1159:             * attribute are placed at the end of the dataset.
1160:             *
1161:             * @param att the attribute
1162:             */
1163:            public void sort(Attribute att) {
1164:
1165:                sort(att.index());
1166:            }
1167:
1168:            /**
1169:             * Stratifies a set of instances according to its class values 
1170:             * if the class attribute is nominal (so that afterwards a 
1171:             * stratified cross-validation can be performed).
1172:             *
1173:             * @param numFolds the number of folds in the cross-validation
1174:             * @throws UnassignedClassException if the class is not set
1175:             */
1176:            public void stratify(int numFolds) {
1177:
1178:                if (numFolds <= 0) {
1179:                    throw new IllegalArgumentException(
1180:                            "Number of folds must be greater than 1");
1181:                }
1182:                if (m_ClassIndex < 0) {
1183:                    throw new UnassignedClassException(
1184:                            "Class index is negative (not set)!");
1185:                }
1186:                if (classAttribute().isNominal()) {
1187:
1188:                    // sort by class
1189:                    int index = 1;
1190:                    while (index < numInstances()) {
1191:                        Instance instance1 = instance(index - 1);
1192:                        for (int j = index; j < numInstances(); j++) {
1193:                            Instance instance2 = instance(j);
1194:                            if ((instance1.classValue() == instance2
1195:                                    .classValue())
1196:                                    || (instance1.classIsMissing() && instance2
1197:                                            .classIsMissing())) {
1198:                                swap(index, j);
1199:                                index++;
1200:                            }
1201:                        }
1202:                        index++;
1203:                    }
1204:                    stratStep(numFolds);
1205:                }
1206:            }
1207:
1208:            /**
1209:             * Computes the sum of all the instances' weights.
1210:             *
1211:             * @return the sum of all the instances' weights as a double
1212:             */
1213:            public/*@pure@*/double sumOfWeights() {
1214:
1215:                double sum = 0;
1216:
1217:                for (int i = 0; i < numInstances(); i++) {
1218:                    sum += instance(i).weight();
1219:                }
1220:                return sum;
1221:            }
1222:
1223:            /**
1224:             * Creates the test set for one fold of a cross-validation on 
1225:             * the dataset.
1226:             *
1227:             * @param numFolds the number of folds in the cross-validation. Must
1228:             * be greater than 1.
1229:             * @param numFold 0 for the first fold, 1 for the second, ...
1230:             * @return the test set as a set of weighted instances
1231:             * @throws IllegalArgumentException if the number of folds is less than 2
1232:             * or greater than the number of instances.
1233:             */
1234:            //@ requires 2 <= numFolds && numFolds < numInstances();
1235:            //@ requires 0 <= numFold && numFold < numFolds;
1236:            public Instances testCV(int numFolds, int numFold) {
1237:
1238:                int numInstForFold, first, offset;
1239:                Instances test;
1240:
1241:                if (numFolds < 2) {
1242:                    throw new IllegalArgumentException(
1243:                            "Number of folds must be at least 2!");
1244:                }
1245:                if (numFolds > numInstances()) {
1246:                    throw new IllegalArgumentException(
1247:                            "Can't have more folds than instances!");
1248:                }
1249:                numInstForFold = numInstances() / numFolds;
1250:                if (numFold < numInstances() % numFolds) {
1251:                    numInstForFold++;
1252:                    offset = numFold;
1253:                } else
1254:                    offset = numInstances() % numFolds;
1255:                test = new Instances(this , numInstForFold);
1256:                first = numFold * (numInstances() / numFolds) + offset;
1257:                copyInstances(first, test, numInstForFold);
1258:                return test;
1259:            }
1260:
1261:            /**
1262:             * Returns the dataset as a string in ARFF format. Strings
1263:             * are quoted if they contain whitespace characters, or if they
1264:             * are a question mark.
1265:             *
1266:             * @return the dataset in ARFF format as a string
1267:             */
1268:            public String toString() {
1269:
1270:                StringBuffer text = new StringBuffer();
1271:
1272:                text.append(ARFF_RELATION).append(" ").append(
1273:                        Utils.quote(m_RelationName)).append("\n\n");
1274:                for (int i = 0; i < numAttributes(); i++) {
1275:                    text.append(attribute(i)).append("\n");
1276:                }
1277:                text.append("\n").append(ARFF_DATA).append("\n");
1278:
1279:                text.append(stringWithoutHeader());
1280:                return text.toString();
1281:            }
1282:
1283:            /**
1284:             * Returns the instances in the dataset as a string in ARFF format. Strings
1285:             * are quoted if they contain whitespace characters, or if they
1286:             * are a question mark.
1287:             *
1288:             * @return the dataset in ARFF format as a string
1289:             */
1290:            protected String stringWithoutHeader() {
1291:
1292:                StringBuffer text = new StringBuffer();
1293:
1294:                for (int i = 0; i < numInstances(); i++) {
1295:                    text.append(instance(i));
1296:                    if (i < numInstances() - 1) {
1297:                        text.append('\n');
1298:                    }
1299:                }
1300:                return text.toString();
1301:            }
1302:
1303:            /**
1304:             * Creates the training set for one fold of a cross-validation 
1305:             * on the dataset. 
1306:             *
1307:             * @param numFolds the number of folds in the cross-validation. Must
1308:             * be greater than 1.
1309:             * @param numFold 0 for the first fold, 1 for the second, ...
1310:             * @return the training set 
1311:             * @throws IllegalArgumentException if the number of folds is less than 2
1312:             * or greater than the number of instances.
1313:             */
1314:            //@ requires 2 <= numFolds && numFolds < numInstances();
1315:            //@ requires 0 <= numFold && numFold < numFolds;
1316:            public Instances trainCV(int numFolds, int numFold) {
1317:
1318:                int numInstForFold, first, offset;
1319:                Instances train;
1320:
1321:                if (numFolds < 2) {
1322:                    throw new IllegalArgumentException(
1323:                            "Number of folds must be at least 2!");
1324:                }
1325:                if (numFolds > numInstances()) {
1326:                    throw new IllegalArgumentException(
1327:                            "Can't have more folds than instances!");
1328:                }
1329:                numInstForFold = numInstances() / numFolds;
1330:                if (numFold < numInstances() % numFolds) {
1331:                    numInstForFold++;
1332:                    offset = numFold;
1333:                } else
1334:                    offset = numInstances() % numFolds;
1335:                train = new Instances(this , numInstances() - numInstForFold);
1336:                first = numFold * (numInstances() / numFolds) + offset;
1337:                copyInstances(0, train, first);
1338:                copyInstances(first + numInstForFold, train, numInstances()
1339:                        - first - numInstForFold);
1340:
1341:                return train;
1342:            }
1343:
1344:            /**
1345:             * Creates the training set for one fold of a cross-validation 
1346:             * on the dataset. The data is subsequently randomized based
1347:             * on the given random number generator.
1348:             *
1349:             * @param numFolds the number of folds in the cross-validation. Must
1350:             * be greater than 1.
1351:             * @param numFold 0 for the first fold, 1 for the second, ...
1352:             * @param random the random number generator
1353:             * @return the training set 
1354:             * @throws IllegalArgumentException if the number of folds is less than 2
1355:             * or greater than the number of instances.
1356:             */
1357:            //@ requires 2 <= numFolds && numFolds < numInstances();
1358:            //@ requires 0 <= numFold && numFold < numFolds;
1359:            public Instances trainCV(int numFolds, int numFold, Random random) {
1360:
1361:                Instances train = trainCV(numFolds, numFold);
1362:                train.randomize(random);
1363:                return train;
1364:            }
1365:
1366:            /**
1367:             * Computes the variance for a numeric attribute.
1368:             *
1369:             * @param attIndex the numeric attribute (index starts with 0)
1370:             * @return the variance if the attribute is numeric
1371:             * @throws IllegalArgumentException if the attribute is not numeric
1372:             */
1373:            public/*@pure@*/double variance(int attIndex) {
1374:
1375:                double sum = 0, sumSquared = 0, sumOfWeights = 0;
1376:
1377:                if (!attribute(attIndex).isNumeric()) {
1378:                    throw new IllegalArgumentException(
1379:                            "Can't compute variance because attribute is "
1380:                                    + "not numeric!");
1381:                }
1382:                for (int i = 0; i < numInstances(); i++) {
1383:                    if (!instance(i).isMissing(attIndex)) {
1384:                        sum += instance(i).weight()
1385:                                * instance(i).value(attIndex);
1386:                        sumSquared += instance(i).weight()
1387:                                * instance(i).value(attIndex)
1388:                                * instance(i).value(attIndex);
1389:                        sumOfWeights += instance(i).weight();
1390:                    }
1391:                }
1392:                if (sumOfWeights <= 1) {
1393:                    return 0;
1394:                }
1395:                double result = (sumSquared - (sum * sum / sumOfWeights))
1396:                        / (sumOfWeights - 1);
1397:
1398:                // We don't like negative variance
1399:                if (result < 0) {
1400:                    return 0;
1401:                } else {
1402:                    return result;
1403:                }
1404:            }
1405:
1406:            /**
1407:             * Computes the variance for a numeric attribute.
1408:             *
1409:             * @param att the numeric attribute
1410:             * @return the variance if the attribute is numeric
1411:             * @throws IllegalArgumentException if the attribute is not numeric
1412:             */
1413:            public/*@pure@*/double variance(Attribute att) {
1414:
1415:                return variance(att.index());
1416:            }
1417:
1418:            /**
1419:             * Calculates summary statistics on the values that appear in this
1420:             * set of instances for a specified attribute.
1421:             *
1422:             * @param index the index of the attribute to summarize (index starts with 0)
1423:             * @return an AttributeStats object with it's fields calculated.
1424:             */
1425:            //@ requires 0 <= index && index < numAttributes();
1426:            public AttributeStats attributeStats(int index) {
1427:
1428:                AttributeStats result = new AttributeStats();
1429:                if (attribute(index).isNominal()) {
1430:                    result.nominalCounts = new int[attribute(index).numValues()];
1431:                }
1432:                if (attribute(index).isNumeric()) {
1433:                    result.numericStats = new weka.experiment.Stats();
1434:                }
1435:                result.totalCount = numInstances();
1436:
1437:                double[] attVals = attributeToDoubleArray(index);
1438:                int[] sorted = Utils.sort(attVals);
1439:                int currentCount = 0;
1440:                double prev = Instance.missingValue();
1441:                for (int j = 0; j < numInstances(); j++) {
1442:                    Instance current = instance(sorted[j]);
1443:                    if (current.isMissing(index)) {
1444:                        result.missingCount = numInstances() - j;
1445:                        break;
1446:                    }
1447:                    if (current.value(index) == prev) {
1448:                        currentCount++;
1449:                    } else {
1450:                        result.addDistinct(prev, currentCount);
1451:                        currentCount = 1;
1452:                        prev = current.value(index);
1453:                    }
1454:                }
1455:                result.addDistinct(prev, currentCount);
1456:                result.distinctCount--; // So we don't count "missing" as a value 
1457:                return result;
1458:            }
1459:
1460:            /**
1461:             * Gets the value of all instances in this dataset for a particular
1462:             * attribute. Useful in conjunction with Utils.sort to allow iterating
1463:             * through the dataset in sorted order for some attribute.
1464:             *
1465:             * @param index the index of the attribute.
1466:             * @return an array containing the value of the desired attribute for
1467:             * each instance in the dataset. 
1468:             */
1469:            //@ requires 0 <= index && index < numAttributes();
1470:            public/*@pure@*/double[] attributeToDoubleArray(int index) {
1471:
1472:                double[] result = new double[numInstances()];
1473:                for (int i = 0; i < result.length; i++) {
1474:                    result[i] = instance(i).value(index);
1475:                }
1476:                return result;
1477:            }
1478:
1479:            /**
1480:             * Generates a string summarizing the set of instances. Gives a breakdown
1481:             * for each attribute indicating the number of missing/discrete/unique
1482:             * values and other information.
1483:             *
1484:             * @return a string summarizing the dataset
1485:             */
1486:            public String toSummaryString() {
1487:
1488:                StringBuffer result = new StringBuffer();
1489:                result.append("Relation Name:  ").append(relationName())
1490:                        .append('\n');
1491:                result.append("Num Instances:  ").append(numInstances())
1492:                        .append('\n');
1493:                result.append("Num Attributes: ").append(numAttributes())
1494:                        .append('\n');
1495:                result.append('\n');
1496:
1497:                result.append(Utils.padLeft("", 5)).append(
1498:                        Utils.padRight("Name", 25));
1499:                result.append(Utils.padLeft("Type", 5)).append(
1500:                        Utils.padLeft("Nom", 5));
1501:                result.append(Utils.padLeft("Int", 5)).append(
1502:                        Utils.padLeft("Real", 5));
1503:                result.append(Utils.padLeft("Missing", 12));
1504:                result.append(Utils.padLeft("Unique", 12));
1505:                result.append(Utils.padLeft("Dist", 6)).append('\n');
1506:                for (int i = 0; i < numAttributes(); i++) {
1507:                    Attribute a = attribute(i);
1508:                    AttributeStats as = attributeStats(i);
1509:                    result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
1510:                    result.append(Utils.padRight(a.name(), 25)).append(' ');
1511:                    long percent;
1512:                    switch (a.type()) {
1513:                    case Attribute.NOMINAL:
1514:                        result.append(Utils.padLeft("Nom", 4)).append(' ');
1515:                        percent = Math.round(100.0 * as.intCount
1516:                                / as.totalCount);
1517:                        result.append(Utils.padLeft("" + percent, 3)).append(
1518:                                "% ");
1519:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1520:                        percent = Math.round(100.0 * as.realCount
1521:                                / as.totalCount);
1522:                        result.append(Utils.padLeft("" + percent, 3)).append(
1523:                                "% ");
1524:                        break;
1525:                    case Attribute.NUMERIC:
1526:                        result.append(Utils.padLeft("Num", 4)).append(' ');
1527:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1528:                        percent = Math.round(100.0 * as.intCount
1529:                                / as.totalCount);
1530:                        result.append(Utils.padLeft("" + percent, 3)).append(
1531:                                "% ");
1532:                        percent = Math.round(100.0 * as.realCount
1533:                                / as.totalCount);
1534:                        result.append(Utils.padLeft("" + percent, 3)).append(
1535:                                "% ");
1536:                        break;
1537:                    case Attribute.DATE:
1538:                        result.append(Utils.padLeft("Dat", 4)).append(' ');
1539:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1540:                        percent = Math.round(100.0 * as.intCount
1541:                                / as.totalCount);
1542:                        result.append(Utils.padLeft("" + percent, 3)).append(
1543:                                "% ");
1544:                        percent = Math.round(100.0 * as.realCount
1545:                                / as.totalCount);
1546:                        result.append(Utils.padLeft("" + percent, 3)).append(
1547:                                "% ");
1548:                        break;
1549:                    case Attribute.STRING:
1550:                        result.append(Utils.padLeft("Str", 4)).append(' ');
1551:                        percent = Math.round(100.0 * as.intCount
1552:                                / as.totalCount);
1553:                        result.append(Utils.padLeft("" + percent, 3)).append(
1554:                                "% ");
1555:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1556:                        percent = Math.round(100.0 * as.realCount
1557:                                / as.totalCount);
1558:                        result.append(Utils.padLeft("" + percent, 3)).append(
1559:                                "% ");
1560:                        break;
1561:                    case Attribute.RELATIONAL:
1562:                        result.append(Utils.padLeft("Rel", 4)).append(' ');
1563:                        percent = Math.round(100.0 * as.intCount
1564:                                / as.totalCount);
1565:                        result.append(Utils.padLeft("" + percent, 3)).append(
1566:                                "% ");
1567:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1568:                        percent = Math.round(100.0 * as.realCount
1569:                                / as.totalCount);
1570:                        result.append(Utils.padLeft("" + percent, 3)).append(
1571:                                "% ");
1572:                        break;
1573:                    default:
1574:                        result.append(Utils.padLeft("???", 4)).append(' ');
1575:                        result.append(Utils.padLeft("" + 0, 3)).append("% ");
1576:                        percent = Math.round(100.0 * as.intCount
1577:                                / as.totalCount);
1578:                        result.append(Utils.padLeft("" + percent, 3)).append(
1579:                                "% ");
1580:                        percent = Math.round(100.0 * as.realCount
1581:                                / as.totalCount);
1582:                        result.append(Utils.padLeft("" + percent, 3)).append(
1583:                                "% ");
1584:                        break;
1585:                    }
1586:                    result.append(Utils.padLeft("" + as.missingCount, 5))
1587:                            .append(" /");
1588:                    percent = Math.round(100.0 * as.missingCount
1589:                            / as.totalCount);
1590:                    result.append(Utils.padLeft("" + percent, 3)).append("% ");
1591:                    result.append(Utils.padLeft("" + as.uniqueCount, 5))
1592:                            .append(" /");
1593:                    percent = Math
1594:                            .round(100.0 * as.uniqueCount / as.totalCount);
1595:                    result.append(Utils.padLeft("" + percent, 3)).append("% ");
1596:                    result.append(Utils.padLeft("" + as.distinctCount, 5))
1597:                            .append(' ');
1598:                    result.append('\n');
1599:                }
1600:                return result.toString();
1601:            }
1602:
1603:            /**
1604:             * Copies instances from one set to the end of another 
1605:             * one.
1606:             *
1607:             * @param from the position of the first instance to be copied
1608:             * @param dest the destination for the instances
1609:             * @param num the number of instances to be copied
1610:             */
1611:            //@ requires 0 <= from && from <= numInstances() - num;
1612:            //@ requires 0 <= num;
1613:            protected void copyInstances(int from, /*@non_null@*/
1614:                    Instances dest, int num) {
1615:
1616:                for (int i = 0; i < num; i++) {
1617:                    dest.add(instance(from + i));
1618:                }
1619:            }
1620:
1621:            /**
1622:             * Replaces the attribute information by a clone of
1623:             * itself.
1624:             */
1625:            protected void freshAttributeInfo() {
1626:
1627:                m_Attributes = (FastVector) m_Attributes.copyElements();
1628:            }
1629:
1630:            /**
1631:             * Returns string including all instances, their weights and
1632:             * their indices in the original dataset.
1633:             *
1634:             * @return description of instance and its weight as a string
1635:             */
1636:            protected/*@pure@*/String instancesAndWeights() {
1637:
1638:                StringBuffer text = new StringBuffer();
1639:
1640:                for (int i = 0; i < numInstances(); i++) {
1641:                    text.append(instance(i) + " " + instance(i).weight());
1642:                    if (i < numInstances() - 1) {
1643:                        text.append("\n");
1644:                    }
1645:                }
1646:                return text.toString();
1647:            }
1648:
1649:            /**
1650:             * Partitions the instances around a pivot. Used by quicksort and
1651:             * kthSmallestValue.
1652:             *
1653:             * @param attIndex the attribute's index (index starts with 0)
1654:             * @param l the first index of the subset (index starts with 0)
1655:             * @param r the last index of the subset (index starts with 0)
1656:             *
1657:             * @return the index of the middle element
1658:             */
1659:            //@ requires 0 <= attIndex && attIndex < numAttributes();
1660:            //@ requires 0 <= left && left <= right && right < numInstances();
1661:            protected int partition(int attIndex, int l, int r) {
1662:
1663:                double pivot = instance((l + r) / 2).value(attIndex);
1664:
1665:                while (l < r) {
1666:                    while ((instance(l).value(attIndex) < pivot) && (l < r)) {
1667:                        l++;
1668:                    }
1669:                    while ((instance(r).value(attIndex) > pivot) && (l < r)) {
1670:                        r--;
1671:                    }
1672:                    if (l < r) {
1673:                        swap(l, r);
1674:                        l++;
1675:                        r--;
1676:                    }
1677:                }
1678:                if ((l == r) && (instance(r).value(attIndex) > pivot)) {
1679:                    r--;
1680:                }
1681:
1682:                return r;
1683:            }
1684:
1685:            /**
1686:             * Implements quicksort according to Manber's "Introduction to
1687:             * Algorithms".
1688:             *
1689:             * @param attIndex the attribute's index (index starts with 0)
1690:             * @param left the first index of the subset to be sorted (index starts with 0)
1691:             * @param right the last index of the subset to be sorted (index starts with 0)
1692:             */
1693:            //@ requires 0 <= attIndex && attIndex < numAttributes();
1694:            //@ requires 0 <= first && first <= right && right < numInstances();
1695:            protected void quickSort(int attIndex, int left, int right) {
1696:
1697:                if (left < right) {
1698:                    int middle = partition(attIndex, left, right);
1699:                    quickSort(attIndex, left, middle);
1700:                    quickSort(attIndex, middle + 1, right);
1701:                }
1702:            }
1703:
1704:            /**
1705:             * Implements computation of the kth-smallest element according
1706:             * to Manber's "Introduction to Algorithms".
1707:             *
1708:             * @param attIndex the attribute's index (index starts with 0)
1709:             * @param left the first index of the subset (index starts with 0)
1710:             * @param right the last index of the subset (index starts with 0)
1711:             * @param k the value of k
1712:             *
1713:             * @return the index of the kth-smallest element
1714:             */
1715:            //@ requires 0 <= attIndex && attIndex < numAttributes();
1716:            //@ requires 0 <= first && first <= right && right < numInstances();
1717:            protected int select(int attIndex, int left, int right, int k) {
1718:
1719:                if (left == right) {
1720:                    return left;
1721:                } else {
1722:                    int middle = partition(attIndex, left, right);
1723:                    if ((middle - left + 1) >= k) {
1724:                        return select(attIndex, left, middle, k);
1725:                    } else {
1726:                        return select(attIndex, middle + 1, right, k
1727:                                - (middle - left + 1));
1728:                    }
1729:                }
1730:            }
1731:
1732:            /**
1733:             * Help function needed for stratification of set.
1734:             *
1735:             * @param numFolds the number of folds for the stratification
1736:             */
1737:            protected void stratStep(int numFolds) {
1738:
1739:                FastVector newVec = new FastVector(m_Instances.capacity());
1740:                int start = 0, j;
1741:
1742:                // create stratified batch
1743:                while (newVec.size() < numInstances()) {
1744:                    j = start;
1745:                    while (j < numInstances()) {
1746:                        newVec.addElement(instance(j));
1747:                        j = j + numFolds;
1748:                    }
1749:                    start++;
1750:                }
1751:                m_Instances = newVec;
1752:            }
1753:
1754:            /**
1755:             * Swaps two instances in the set.
1756:             *
1757:             * @param i the first instance's index (index starts with 0)
1758:             * @param j the second instance's index (index starts with 0)
1759:             */
1760:            //@ requires 0 <= i && i < numInstances();
1761:            //@ requires 0 <= j && j < numInstances();
1762:            public void swap(int i, int j) {
1763:
1764:                m_Instances.swap(i, j);
1765:            }
1766:
1767:            /**
1768:             * Merges two sets of Instances together. The resulting set will have
1769:             * all the attributes of the first set plus all the attributes of the 
1770:             * second set. The number of instances in both sets must be the same.
1771:             *
1772:             * @param first the first set of Instances
1773:             * @param second the second set of Instances
1774:             * @return the merged set of Instances
1775:             * @throws IllegalArgumentException if the datasets are not the same size
1776:             */
1777:            public static Instances mergeInstances(Instances first,
1778:                    Instances second) {
1779:
1780:                if (first.numInstances() != second.numInstances()) {
1781:                    throw new IllegalArgumentException(
1782:                            "Instance sets must be of the same size");
1783:                }
1784:
1785:                // Create the vector of merged attributes
1786:                FastVector newAttributes = new FastVector();
1787:                for (int i = 0; i < first.numAttributes(); i++) {
1788:                    newAttributes.addElement(first.attribute(i));
1789:                }
1790:                for (int i = 0; i < second.numAttributes(); i++) {
1791:                    newAttributes.addElement(second.attribute(i));
1792:                }
1793:
1794:                // Create the set of Instances
1795:                Instances merged = new Instances(first.relationName() + '_'
1796:                        + second.relationName(), newAttributes, first
1797:                        .numInstances());
1798:                // Merge each instance
1799:                for (int i = 0; i < first.numInstances(); i++) {
1800:                    merged.add(first.instance(i).mergeInstance(
1801:                            second.instance(i)));
1802:                }
1803:                return merged;
1804:            }
1805:
1806:            /**
1807:             * Method for testing this class.
1808:             *
1809:             * @param argv should contain one element: the name of an ARFF file
1810:             */
1811:            //@ requires argv != null;
1812:            //@ requires argv.length == 1;
1813:            //@ requires argv[0] != null;
1814:            public static void test(String[] argv) {
1815:
1816:                Instances instances, secondInstances, train, test, empty;
1817:                Random random = new Random(2);
1818:                Reader reader;
1819:                int start, num;
1820:                FastVector testAtts, testVals;
1821:                int i, j;
1822:
1823:                try {
1824:                    if (argv.length > 1) {
1825:                        throw (new Exception("Usage: Instances [<filename>]"));
1826:                    }
1827:
1828:                    // Creating set of instances from scratch
1829:                    testVals = new FastVector(2);
1830:                    testVals.addElement("first_value");
1831:                    testVals.addElement("second_value");
1832:                    testAtts = new FastVector(2);
1833:                    testAtts.addElement(new Attribute("nominal_attribute",
1834:                            testVals));
1835:                    testAtts.addElement(new Attribute("numeric_attribute"));
1836:                    instances = new Instances("test_set", testAtts, 10);
1837:                    instances.add(new Instance(instances.numAttributes()));
1838:                    instances.add(new Instance(instances.numAttributes()));
1839:                    instances.add(new Instance(instances.numAttributes()));
1840:                    instances.setClassIndex(0);
1841:                    System.out
1842:                            .println("\nSet of instances created from scratch:\n");
1843:                    System.out.println(instances);
1844:
1845:                    if (argv.length == 1) {
1846:                        String filename = argv[0];
1847:                        reader = new FileReader(filename);
1848:
1849:                        // Read first five instances and print them
1850:                        System.out
1851:                                .println("\nFirst five instances from file:\n");
1852:                        instances = new Instances(reader, 1);
1853:                        instances.setClassIndex(instances.numAttributes() - 1);
1854:                        i = 0;
1855:                        while ((i < 5) && (instances.readInstance(reader))) {
1856:                            i++;
1857:                        }
1858:                        System.out.println(instances);
1859:
1860:                        // Read all the instances in the file
1861:                        reader = new FileReader(filename);
1862:                        instances = new Instances(reader);
1863:
1864:                        // Make the last attribute be the class 
1865:                        instances.setClassIndex(instances.numAttributes() - 1);
1866:
1867:                        // Print header and instances.
1868:                        System.out.println("\nDataset:\n");
1869:                        System.out.println(instances);
1870:                        System.out.println("\nClass index: "
1871:                                + instances.classIndex());
1872:                    }
1873:
1874:                    // Test basic methods based on class index.
1875:                    System.out.println("\nClass name: "
1876:                            + instances.classAttribute().name());
1877:                    System.out.println("\nClass index: "
1878:                            + instances.classIndex());
1879:                    System.out.println("\nClass is nominal: "
1880:                            + instances.classAttribute().isNominal());
1881:                    System.out.println("\nClass is numeric: "
1882:                            + instances.classAttribute().isNumeric());
1883:                    System.out.println("\nClasses:\n");
1884:                    for (i = 0; i < instances.numClasses(); i++) {
1885:                        System.out.println(instances.classAttribute().value(i));
1886:                    }
1887:                    System.out
1888:                            .println("\nClass values and labels of instances:\n");
1889:                    for (i = 0; i < instances.numInstances(); i++) {
1890:                        Instance inst = instances.instance(i);
1891:                        System.out.print(inst.classValue() + "\t");
1892:                        System.out.print(inst.toString(inst.classIndex()));
1893:                        if (instances.instance(i).classIsMissing()) {
1894:                            System.out.println("\tis missing");
1895:                        } else {
1896:                            System.out.println();
1897:                        }
1898:                    }
1899:
1900:                    // Create random weights.
1901:                    System.out
1902:                            .println("\nCreating random weights for instances.");
1903:                    for (i = 0; i < instances.numInstances(); i++) {
1904:                        instances.instance(i).setWeight(random.nextDouble());
1905:                    }
1906:
1907:                    // Print all instances and their weights (and the sum of weights).
1908:                    System.out.println("\nInstances and their weights:\n");
1909:                    System.out.println(instances.instancesAndWeights());
1910:                    System.out.print("\nSum of weights: ");
1911:                    System.out.println(instances.sumOfWeights());
1912:
1913:                    // Insert an attribute
1914:                    secondInstances = new Instances(instances);
1915:                    Attribute testAtt = new Attribute("Inserted");
1916:                    secondInstances.insertAttributeAt(testAtt, 0);
1917:                    System.out.println("\nSet with inserted attribute:\n");
1918:                    System.out.println(secondInstances);
1919:                    System.out.println("\nClass name: "
1920:                            + secondInstances.classAttribute().name());
1921:
1922:                    // Delete the attribute
1923:                    secondInstances.deleteAttributeAt(0);
1924:                    System.out.println("\nSet with attribute deleted:\n");
1925:                    System.out.println(secondInstances);
1926:                    System.out.println("\nClass name: "
1927:                            + secondInstances.classAttribute().name());
1928:
1929:                    // Test if headers are equal
1930:                    System.out.println("\nHeaders equal: "
1931:                            + instances.equalHeaders(secondInstances) + "\n");
1932:
1933:                    // Print data in internal format.
1934:                    System.out.println("\nData (internal values):\n");
1935:                    for (i = 0; i < instances.numInstances(); i++) {
1936:                        for (j = 0; j < instances.numAttributes(); j++) {
1937:                            if (instances.instance(i).isMissing(j)) {
1938:                                System.out.print("? ");
1939:                            } else {
1940:                                System.out.print(instances.instance(i).value(j)
1941:                                        + " ");
1942:                            }
1943:                        }
1944:                        System.out.println();
1945:                    }
1946:
1947:                    // Just print header
1948:                    System.out.println("\nEmpty dataset:\n");
1949:                    empty = new Instances(instances, 0);
1950:                    System.out.println(empty);
1951:                    System.out.println("\nClass name: "
1952:                            + empty.classAttribute().name());
1953:
1954:                    // Create copy and rename an attribute and a value (if possible)
1955:                    if (empty.classAttribute().isNominal()) {
1956:                        Instances copy = new Instances(empty, 0);
1957:                        copy.renameAttribute(copy.classAttribute(), "new_name");
1958:                        copy.renameAttributeValue(copy.classAttribute(), copy
1959:                                .classAttribute().value(0), "new_val_name");
1960:                        System.out.println("\nDataset with names changed:\n"
1961:                                + copy);
1962:                        System.out.println("\nOriginal dataset:\n" + empty);
1963:                    }
1964:
1965:                    // Create and prints subset of instances.
1966:                    start = instances.numInstances() / 4;
1967:                    num = instances.numInstances() / 2;
1968:                    System.out.print("\nSubset of dataset: ");
1969:                    System.out.println(num + " instances from " + (start + 1)
1970:                            + ". instance");
1971:                    secondInstances = new Instances(instances, start, num);
1972:                    System.out.println("\nClass name: "
1973:                            + secondInstances.classAttribute().name());
1974:
1975:                    // Print all instances and their weights (and the sum of weights).
1976:                    System.out.println("\nInstances and their weights:\n");
1977:                    System.out.println(secondInstances.instancesAndWeights());
1978:                    System.out.print("\nSum of weights: ");
1979:                    System.out.println(secondInstances.sumOfWeights());
1980:
1981:                    // Create and print training and test sets for 3-fold
1982:                    // cross-validation.
1983:                    System.out.println("\nTrain and test folds for 3-fold CV:");
1984:                    if (instances.classAttribute().isNominal()) {
1985:                        instances.stratify(3);
1986:                    }
1987:                    for (j = 0; j < 3; j++) {
1988:                        train = instances.trainCV(3, j, new Random(1));
1989:                        test = instances.testCV(3, j);
1990:
1991:                        // Print all instances and their weights (and the sum of weights).
1992:                        System.out.println("\nTrain: ");
1993:                        System.out.println("\nInstances and their weights:\n");
1994:                        System.out.println(train.instancesAndWeights());
1995:                        System.out.print("\nSum of weights: ");
1996:                        System.out.println(train.sumOfWeights());
1997:                        System.out.println("\nClass name: "
1998:                                + train.classAttribute().name());
1999:                        System.out.println("\nTest: ");
2000:                        System.out.println("\nInstances and their weights:\n");
2001:                        System.out.println(test.instancesAndWeights());
2002:                        System.out.print("\nSum of weights: ");
2003:                        System.out.println(test.sumOfWeights());
2004:                        System.out.println("\nClass name: "
2005:                                + test.classAttribute().name());
2006:                    }
2007:
2008:                    // Randomize instances and print them.
2009:                    System.out.println("\nRandomized dataset:");
2010:                    instances.randomize(random);
2011:
2012:                    // Print all instances and their weights (and the sum of weights).
2013:                    System.out.println("\nInstances and their weights:\n");
2014:                    System.out.println(instances.instancesAndWeights());
2015:                    System.out.print("\nSum of weights: ");
2016:                    System.out.println(instances.sumOfWeights());
2017:
2018:                    // Sort instances according to first attribute and
2019:                    // print them.
2020:                    System.out
2021:                            .print("\nInstances sorted according to first attribute:\n ");
2022:                    instances.sort(0);
2023:
2024:                    // Print all instances and their weights (and the sum of weights).
2025:                    System.out.println("\nInstances and their weights:\n");
2026:                    System.out.println(instances.instancesAndWeights());
2027:                    System.out.print("\nSum of weights: ");
2028:                    System.out.println(instances.sumOfWeights());
2029:                } catch (Exception e) {
2030:                    e.printStackTrace();
2031:                }
2032:            }
2033:
2034:            /**
2035:             * Main method for this class. The following calls are possible:
2036:             * <ul>
2037:             *   <li>
2038:             *     <code>weka.core.Instances</code> help<br/>
2039:             *     prints a short list of possible commands.
2040:             *   </li>
2041:             *   <li>
2042:             *     <code>weka.core.Instances</code> &lt;filename&gt;<br/>
2043:             *     prints a summary of a set of instances.
2044:             *   </li>
2045:             *   <li>
2046:             *     <code>weka.core.Instances</code> merge &lt;filename1&gt; &lt;filename2&gt;<br/>
2047:             *     merges the two datasets (must have same number of instances) and
2048:             *     outputs the results on stdout.
2049:             *   </li>
2050:             *   <li>
2051:             *     <code>weka.core.Instances</code> append &lt;filename1&gt; &lt;filename2&gt;<br/>
2052:             *     appends the second dataset to the first one (must have same headers) and
2053:             *     outputs the results on stdout.
2054:             *   </li>
2055:             *   <li>
2056:             *     <code>weka.core.Instances</code> randomize &lt;seed&gt; &lt;filename&gt;<br/>
2057:             *     randomizes the dataset with the given seed and outputs the result on stdout.
2058:             *   </li>
2059:             * </ul>
2060:             *
2061:             * @param args 	the commandline parameters
2062:             */
2063:            public static void main(String[] args) {
2064:
2065:                try {
2066:                    Instances i;
2067:                    // read from stdin and print statistics
2068:                    if (args.length == 0) {
2069:                        DataSource source = new DataSource(System.in);
2070:                        i = source.getDataSet();
2071:                        System.out.println(i.toSummaryString());
2072:                    }
2073:                    // read file and print statistics
2074:                    else if ((args.length == 1) && (!args[0].equals("-h"))
2075:                            && (!args[0].equals("help"))) {
2076:                        DataSource source = new DataSource(args[0]);
2077:                        i = source.getDataSet();
2078:                        System.out.println(i.toSummaryString());
2079:                    }
2080:                    // read two files, merge them and print result to stdout
2081:                    else if ((args.length == 3)
2082:                            && (args[0].toLowerCase().equals("merge"))) {
2083:                        DataSource source1 = new DataSource(args[1]);
2084:                        DataSource source2 = new DataSource(args[2]);
2085:                        i = Instances.mergeInstances(source1.getDataSet(),
2086:                                source2.getDataSet());
2087:                        System.out.println(i);
2088:                    }
2089:                    // read two files, append them and print result to stdout
2090:                    else if ((args.length == 3)
2091:                            && (args[0].toLowerCase().equals("append"))) {
2092:                        DataSource source1 = new DataSource(args[1]);
2093:                        DataSource source2 = new DataSource(args[2]);
2094:                        if (!source1.getStructure().equalHeaders(
2095:                                source2.getStructure()))
2096:                            throw new Exception(
2097:                                    "The two datasets have different headers!");
2098:                        Instances structure = source1.getStructure();
2099:                        System.out.println(source1.getStructure());
2100:                        while (source1.hasMoreElements(structure))
2101:                            System.out.println(source1.nextElement(structure));
2102:                        structure = source2.getStructure();
2103:                        while (source2.hasMoreElements(structure))
2104:                            System.out.println(source2.nextElement(structure));
2105:                    }
2106:                    // read file and seed value, randomize data and print result to stdout
2107:                    else if ((args.length == 3)
2108:                            && (args[0].toLowerCase().equals("randomize"))) {
2109:                        DataSource source = new DataSource(args[2]);
2110:                        i = source.getDataSet();
2111:                        i.randomize(new Random(Integer.parseInt(args[1])));
2112:                        System.out.println(i);
2113:                    }
2114:                    // wrong parameters
2115:                    else {
2116:                        System.err
2117:                                .println("\nUsage:\n"
2118:                                        + "\tweka.core.Instances help\n"
2119:                                        + "\tweka.core.Instances <filename>\n"
2120:                                        + "\tweka.core.Instances merge <filename1> <filename2>\n"
2121:                                        + "\tweka.core.Instances append <filename1> <filename2>\n"
2122:                                        + "\tweka.core.Instances randomize <seed> <filename>\n");
2123:                        System.exit(1);
2124:                    }
2125:                } catch (Exception ex) {
2126:                    ex.printStackTrace();
2127:                    System.err.println(ex.getMessage());
2128:                }
2129:            }
2130:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.