Source Code Cross Referenced for CSVLoader.java in » Science » weka » weka » core » converters » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Science » weka » weka.core.converters
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         *    This program is free software; you can redistribute it and/or modify
003:         *    it under the terms of the GNU General Public License as published by
004:         *    the Free Software Foundation; either version 2 of the License, or
005:         *    (at your option) any later version.
006:         *
007:         *    This program is distributed in the hope that it will be useful,
008:         *    but WITHOUT ANY WARRANTY; without even the implied warranty of
009:         *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
010:         *    GNU General Public License for more details.
011:         *
012:         *    You should have received a copy of the GNU General Public License
013:         *    along with this program; if not, write to the Free Software
014:         *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015:         */
016:
017:        /*
018:         *    CSVLoader.java
019:         *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
020:         *
021:         */
022:
023:        package weka.core.converters;
024:
025:        import weka.core.Attribute;
026:        import weka.core.FastVector;
027:        import weka.core.Instance;
028:        import weka.core.Instances;
029:
030:        import java.io.BufferedReader;
031:        import java.io.FileNotFoundException;
032:        import java.io.FileReader;
033:        import java.io.IOException;
034:        import java.io.InputStream;
035:        import java.io.StreamTokenizer;
036:        import java.util.Enumeration;
037:        import java.util.Hashtable;
038:
039:        /**
040:         <!-- globalinfo-start -->
041:         * Reads a source that is in comma separated or tab separated format. Assumes that the first row in the file determines the number of and names of the attributes.
042:         * <p/>
043:         <!-- globalinfo-end -->
044:         *
045:         * @author Mark Hall (mhall@cs.waikato.ac.nz)
046:         * @version $Revision: 1.13 $
047:         * @see Loader
048:         */
049:        public class CSVLoader extends AbstractFileLoader implements 
050:                BatchConverter {
051:
052:            /** for serialization */
053:            static final long serialVersionUID = 5607529739745491340L;
054:
055:            /** the file extension returned by getFileExtension() */
056:            public static String FILE_EXTENSION = ".csv";
057:
058:            /**
059:             * A list of hash tables (one per attribute) for accumulating nominal values during parsing; each maps a value to its index and stays empty while the attribute looks numeric.
060:             */
061:            private FastVector m_cumulativeStructure;
062:
063:            /**
064:             * Holds the rows accumulated so far; each element is a FastVector of String and Double values
065:             */
066:            private FastVector m_cumulativeInstances;
067:
068:            /**
069:             * default constructor; sets the retrieval mode to NONE
070:             */
071:            public CSVLoader() {
072:                // No instances retrieved yet
073:                setRetrieval(NONE);
074:            }
075:
076:            /**
077:             * Get the file extension used for CSV files
078:             *
079:             * @return the current value of FILE_EXTENSION
080:             */
081:            public String getFileExtension() {
082:                return FILE_EXTENSION;
083:            }
084:
085:            /**
086:             * Returns a description of the file type handled by this loader.
087:             *
088:             * @return a short, fixed file description
089:             */
090:            public String getFileDescription() {
091:                return "CSV data files";
092:            }
093:
094:            /**
095:             * Gets all the file extensions used for this type of file
096:             *
097:             * @return a new one-element array holding the result of getFileExtension()
098:             */
099:            public String[] getFileExtensions() {
100:                return new String[] { getFileExtension() };
101:            }
102:
103:            /**
104:             * Returns a string describing this loader
105:             * @return a description of the loader suitable for
106:             * displaying in the explorer/experimenter gui
107:             */
108:            public String globalInfo() {
109:                return "Reads a source that is in comma separated or tab separated format. "
110:                        + "Assumes that the first row in the file determines the number of "
111:                        + "and names of the attributes.";
112:            }
113:
114:            /**
115:             * Is ignored and doesn't throw an Exception.
116:             *
117:             * @param input the input stream - ignored
118:             * @exception IOException always
119:             */
120:            public void setSource(InputStream input) throws IOException {
121:                // ignored
122:            }
123:
124:            /**
125:             * Determines and returns (if possible) the structure (internally the
126:             * header) of the data set as an empty set of instances.
127:             *
128:             * @return the structure, or null if the source file cannot be opened
129:             * @exception IOException if no source is set or reading the header fails
130:             */
131:            public Instances getStructure() throws IOException {
132:                if (m_sourceFile == null) {
133:                    throw new IOException("No source has been specified");
134:                }
135:                if (m_structure == null) {
136:                    BufferedReader br = null;
137:                    try {
138:                        br = new BufferedReader(new FileReader(m_sourceFile));
139:                        // assumes that the first line of the file is the header
140:                        StreamTokenizer st = new StreamTokenizer(br);
141:                        initTokenizer(st);
142:                        readStructure(st);
143:                    } catch (FileNotFoundException ex) {
144:                        // kept on purpose: a missing file leaves the structure null
145:                    } finally {
146:                        if (br != null) {
147:                            br.close(); // release the file handle in every case
148:                        }
149:                    }
150:                }
151:
152:                return m_structure;
153:            }
154:
155:            /**
156:             * reads the structure by delegating to readHeader
157:             * 
158:             * @param st the stream tokenizer to read from
159:             * @throws IOException if reading fails
160:             */
161:            private void readStructure(StreamTokenizer st) throws IOException {
162:                readHeader(st);
163:            }
164:
165:            /**
166:             * Return the full data set. The header is re-read from the source file,
167:             * every row is parsed, and each attribute is then typed as nominal (if a
168:             * non-numeric value was seen for it) or numeric.
169:             *
170:             * @return the complete data set read from the source file
171:             * @exception IOException if there is no source or parsing fails
172:             */
173:            public Instances getDataSet() throws IOException {
174:                if (m_sourceFile == null) {
175:                    throw new IOException("No source has been specified");
176:                }
177:                setSource(m_sourceFile);
178:                FastVector current;
179:                BufferedReader br = new BufferedReader(new FileReader(m_sourceFile));
180:                try {
181:                    StreamTokenizer st = new StreamTokenizer(br);
182:                    initTokenizer(st);
183:                    readStructure(st);
184:
185:                    // from here on separators are tokens of their own, so that
186:                    // getInstance can detect empty fields
187:                    st.ordinaryChar(',');
188:                    st.ordinaryChar('\t');
189:
190:                    m_cumulativeStructure = new FastVector(m_structure.numAttributes());
191:                    for (int i = 0; i < m_structure.numAttributes(); i++) {
192:                        m_cumulativeStructure.addElement(new Hashtable());
193:                    }
194:
195:                    m_cumulativeInstances = new FastVector();
196:                    while ((current = getInstance(st)) != null) {
197:                        m_cumulativeInstances.addElement(current);
198:                    }
199:                } finally {
200:                    // release the file handle even when parsing fails
201:                    br.close();
202:                }
203:
204:                // now determine the true structure of the data set
205:                FastVector atts = new FastVector(m_structure.numAttributes());
206:                for (int i = 0; i < m_structure.numAttributes(); i++) {
207:                    String attname = m_structure.attribute(i).name();
208:                    Hashtable tempHash = ((Hashtable) m_cumulativeStructure
209:                            .elementAt(i));
210:                    if (tempHash.size() == 0) {
211:                        // nothing was recorded for this column: numeric attribute
212:                        atts.addElement(new Attribute(attname));
213:                    } else {
214:                        FastVector values = new FastVector(tempHash.size());
215:                        // add dummy objects in order to make the FastVector's size == capacity
216:                        for (int z = 0; z < tempHash.size(); z++) {
217:                            values.addElement("dummy");
218:                        }
219:                        // place every label at the index that was stored for it
220:                        Enumeration e = tempHash.keys();
221:                        while (e.hasMoreElements()) {
222:                            Object ob = e.nextElement();
223:                            int index = ((Integer) tempHash.get(ob)).intValue();
224:                            values.setElementAt(new String(ob.toString()), index);
225:                        }
226:                        atts.addElement(new Attribute(attname, values));
227:                    }
228:                }
229:
230:                // make the instances
231:                String relationName = m_sourceFile.getName().replaceAll("\\.[cC][sS][vV]$", "");
232:                Instances dataSet = new Instances(relationName, atts,
233:                        m_cumulativeInstances.size());
234:
235:                for (int i = 0; i < m_cumulativeInstances.size(); i++) {
236:                    current = ((FastVector) m_cumulativeInstances.elementAt(i));
237:                    double[] vals = new double[dataSet.numAttributes()];
238:                    for (int j = 0; j < current.size(); j++) {
239:                        Object cval = current.elementAt(j);
240:                        if (cval instanceof String) {
241:                            if (((String) cval).compareTo("?") == 0) {
242:                                vals[j] = Instance.missingValue();
243:                            } else {
244:                                if (!dataSet.attribute(j).isNominal()) {
245:                                    // fail the load instead of terminating the JVM
246:                                    throw new IOException("Wrong attribute type!!!");
247:                                }
248:                                // find correct index
249:                                Hashtable lookup = (Hashtable) m_cumulativeStructure
250:                                        .elementAt(j);
251:                                int index = ((Integer) lookup.get(cval))
252:                                        .intValue();
253:                                vals[j] = (double) index;
254:                            }
255:                        } else if (dataSet.attribute(j).isNominal()) {
256:                            // find correct index
257:                            Hashtable lookup = (Hashtable) m_cumulativeStructure
258:                                    .elementAt(j);
259:                            int index = ((Integer) lookup.get(cval)).intValue();
260:                            vals[j] = (double) index;
261:                        } else {
262:                            vals[j] = ((Double) cval).doubleValue();
263:                        }
264:                    }
265:                    dataSet.add(new Instance(1.0, vals));
266:                }
267:                m_structure = new Instances(dataSet, 0);
268:                setRetrieval(BATCH);
269:                m_cumulativeStructure = null; // conserve memory
270:                m_cumulativeInstances = null; // the parsed rows are no longer needed
271:                return dataSet;
272:            }
273:
274:            /**
275:             * CSVLoader is unable to process a data set incrementally; getDataSet() reads the whole file instead.
276:             *
277:             * @param structure ignored
278:             * @return never returns without throwing an exception
279:             * @exception IOException always. CSVLoader is unable to process a data
280:             * set incrementally.
281:             */
282:            public Instance getNextInstance(Instances structure)
283:                    throws IOException {
284:                throw new IOException(
285:                        "CSVLoader can't read data sets incrementally.");
286:            }
287:
288:            /**
289:             * Attempts to parse a line of the data set.
290:             *
291:             * @param tokenizer the tokenizer
292:             * @return a FastVector containing String and Double objects representing
293:             * the values of the instance, or null once the end of the file is reached.
294:             * @exception IOException if the row is malformed or cannot be recorded
295:             *
296:             * <pre><jml>
297:             *    private_normal_behavior
298:             *      requires: tokenizer != null;
299:             *      ensures: \result  != null || (* end of file reached *);
300:             *  also
301:             *    private_exceptional_behavior
302:             *      requires: tokenizer == null
303:             *                || (* unsuccessful parse *);
304:             *      signals: (IOException);
305:             * </jml></pre>
306:             */
307:            private FastVector getInstance(StreamTokenizer tokenizer)
308:                    throws IOException {
309:
310:                FastVector current = new FastVector();
311:
312:                // Check if end of file reached.
313:                ConverterUtils.getFirstToken(tokenizer);
314:                if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
315:                    return null;
316:                }
317:                boolean first = true;
318:                boolean wasSep;
319:
320:                while (tokenizer.ttype != StreamTokenizer.TT_EOL
321:                        && tokenizer.ttype != StreamTokenizer.TT_EOF) {
322:
323:                    // Get next token
324:                    if (!first) {
325:                        ConverterUtils.getToken(tokenizer);
326:                    }
327:
328:                    // An empty field (separator, end of line or end of file where a
329:                    // value is expected) is a missing value. TT_EOF is included because
330:                    // no word is available in sval then and parsing it would fail.
331:                    if (tokenizer.ttype == ',' || tokenizer.ttype == '\t'
332:                            || tokenizer.ttype == StreamTokenizer.TT_EOL
333:                            || tokenizer.ttype == StreamTokenizer.TT_EOF) {
334:                        current.addElement("?");
335:                        wasSep = true;
336:                    } else {
337:                        wasSep = false;
338:                        // try to parse as a number
339:                        try {
340:                            double val = Double.valueOf(tokenizer.sval).doubleValue();
341:                            current.addElement(new Double(val));
342:                        } catch (NumberFormatException e) {
343:                            // otherwise assume its an enumerated value
344:                            current.addElement(new String(tokenizer.sval
345:                                    .replace(' ', '_')));
346:                        }
347:                    }
348:
349:                    if (!wasSep) {
350:                        ConverterUtils.getToken(tokenizer);
351:                    }
352:                    first = false;
353:                }
354:
355:                // check number of values read
356:                if (current.size() != m_structure.numAttributes()) {
357:                    ConverterUtils.errms(tokenizer,
358:                            "wrong number of values. Read " + current.size()
359:                                    + ", expected "
360:                                    + m_structure.numAttributes());
361:                }
362:
363:                // check for structure update
364:                try {
365:                    checkStructure(current);
366:                } catch (Exception ex) {
367:                    // surface the failure instead of only printing a stack trace
368:                    IOException ioe = new IOException(ex.getMessage());
369:                    ioe.initCause(ex);
370:                    throw ioe;
371:                }
372:                return current;
373:            }
374:
375:            /**
376:             * Checks the current instance against what is known about the structure
377:             * of the data set so far. If there is a nominal value for an attribute
378:             * that was believed to be numeric then all previously seen values for this
379:             * attribute are stored in a Hashtable.
380:             *
381:             * @param current the values of one row as String and Double objects
382:             * @exception Exception if current is null or holds an object that is neither a String nor a Double
383:             *
384:             * <pre><jml>
385:             *    private_normal_behavior
386:             *      requires: current != null;
387:             *  also
388:             *    private_exceptional_behavior
389:             *      requires: current == null
390:             *                || (* unrecognized object type in current *);
391:             *      signals: (Exception);
392:             * </jml></pre>
393:             */
394:            private void checkStructure(FastVector current) throws Exception {
395:                if (current == null) {
396:                    throw new Exception(
397:                            "current shouldn't be null in checkStructure");
398:                }
399:                for (int i = 0; i < current.size(); i++) {
400:                    Object ob = current.elementAt(i);
401:                    if (ob instanceof  String) {
402:                        if (((String) ob).compareTo("?") == 0) { // missing value: nothing to record
403:                        } else {
404:                            Hashtable tempHash = (Hashtable) m_cumulativeStructure
405:                                    .elementAt(i);
406:                            if (!tempHash.containsKey(ob)) {
407:                                // may have found a nominal value in what was previously thought to
408:                                // be a numeric variable; if so, index the numbers of earlier rows first.
409:                                if (tempHash.size() == 0) {
410:                                    for (int j = 0; j < m_cumulativeInstances
411:                                            .size(); j++) {
412:                                        FastVector tempUpdate = ((FastVector) m_cumulativeInstances
413:                                                .elementAt(j));
414:                                        Object tempO = tempUpdate.elementAt(i);
415:                                        if (tempO instanceof  String) {
416:                                            // must have been a missing value
417:                                        } else {
418:                                            if (!tempHash.containsKey(tempO)) {
419:                                                tempHash
420:                                                        .put(
421:                                                                new Double(
422:                                                                        ((Double) tempO)
423:                                                                                .doubleValue()),
424:                                                                new Integer(
425:                                                                        tempHash
426:                                                                                .size()));
427:                                            }
428:                                        }
429:                                    }
430:                                }
431:                                int newIndex = tempHash.size(); // next free index for the new value
432:                                tempHash.put(ob, new Integer(newIndex));
433:                            }
434:                        }
435:                    } else if (ob instanceof  Double) {
436:                        Hashtable tempHash = (Hashtable) m_cumulativeStructure
437:                                .elementAt(i);
438:                        if (tempHash.size() != 0) { // values are already being recorded for this attribute
439:                            if (!tempHash.containsKey(ob)) {
440:                                int newIndex = tempHash.size();
441:                                tempHash.put(new Double(((Double) ob)
442:                                        .doubleValue()), new Integer(newIndex));
443:                            }
444:                        }
445:                    } else {
446:                        throw new Exception(
447:                                "Wrong object type in checkStructure!");
448:                    }
449:                }
450:            }
451:
452:            /**
453:             * Assumes the first line of the file contains the attribute names.
454:             * Assumes all attributes are real (Reading the full data set with
455:             * getDataSet will establish the true structure).
456:             *
457:             * @param tokenizer a <code>StreamTokenizer</code> value
458:             * @exception IOException if an error occurs
459:             *
460:             * <pre><jml>
461:             *    private_normal_behavior
462:             *      requires: tokenizer != null;
463:             *      modifiable: m_structure;
464:             *      ensures: m_structure != null;
465:             *  also
466:             *    private_exceptional_behavior
467:             *      requires: tokenizer == null
468:             *                || (* unsuccessful parse *);
469:             *      signals: (IOException);
470:             * </jml></pre>
471:             */
472:            private void readHeader(StreamTokenizer tokenizer)
473:                    throws IOException {
474:                FastVector attribNames = new FastVector();
475:                ConverterUtils.getFirstToken(tokenizer);
476:                if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
477:                    ConverterUtils.errms(tokenizer, "premature end of file");
478:                }
479:                // also stop at TT_EOF: a header that ends at the end of the file
480:                // without a line break never produces TT_EOL
481:                while (tokenizer.ttype != StreamTokenizer.TT_EOL
482:                        && tokenizer.ttype != StreamTokenizer.TT_EOF) {
483:                    attribNames.addElement(new Attribute(tokenizer.sval));
484:                    ConverterUtils.getToken(tokenizer);
485:                }
486:                String relationName = m_sourceFile.getName().replaceAll("\\.[cC][sS][vV]$", "");
487:                m_structure = new Instances(relationName, attribNames, 0);
488:            }
489:
490:            /**
491:             * Initializes the stream tokenizer: resets its syntax table and then sets it up call by call (later calls override earlier ones for the same characters).
492:             *
493:             * @param tokenizer the tokenizer to initialize
494:             */
495:            private void initTokenizer(StreamTokenizer tokenizer) {
496:                tokenizer.resetSyntax();
497:                tokenizer.whitespaceChars(0, (' ' - 1)); // everything below the blank
498:                tokenizer.wordChars(' ', '\u00FF'); // blanks are part of a word
499:                tokenizer.whitespaceChars(',', ','); // separators are skipped here;
500:                tokenizer.whitespaceChars('\t', '\t'); // getDataSet later makes them ordinary characters
501:                //    tokenizer.ordinaryChar(',');
502:                tokenizer.commentChar('%');
503:                tokenizer.quoteChar('"');
504:                tokenizer.quoteChar('\'');
505:                //    tokenizer.ordinaryChar('{');
506:                //    tokenizer.ordinaryChar('}');
507:                tokenizer.eolIsSignificant(true); // callers rely on TT_EOL to find the end of a row
508:            }
509:
510:            /**
511:             * Main method. Hands a new CSVLoader and the arguments to runFileLoader.
512:             *
513:             * @param args should contain the name of an input file.
514:             */
515:            public static void main(String[] args) {
516:                runFileLoader(new CSVLoader(), args);
517:            }
518:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.