001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * CSVLoader.java
019: * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.core.converters;
024:
025: import weka.core.Attribute;
026: import weka.core.FastVector;
027: import weka.core.Instance;
028: import weka.core.Instances;
029:
030: import java.io.BufferedReader;
031: import java.io.FileNotFoundException;
032: import java.io.FileReader;
033: import java.io.IOException;
034: import java.io.InputStream;
035: import java.io.StreamTokenizer;
036: import java.util.Enumeration;
037: import java.util.Hashtable;
038:
039: /**
040: <!-- globalinfo-start -->
041: * Reads a source that is in comma separated or tab separated format. Assumes that the first row in the file determines the number of and names of the attributes.
042: * <p/>
043: <!-- globalinfo-end -->
044: *
045: * @author Mark Hall (mhall@cs.waikato.ac.nz)
046: * @version $Revision: 1.13 $
047: * @see Loader
048: */
049: public class CSVLoader extends AbstractFileLoader implements
050: BatchConverter {
051:
052: /** for serialization */
053: static final long serialVersionUID = 5607529739745491340L;
054:
055: /** the file extension */
056: public static String FILE_EXTENSION = ".csv";
057:
058: /**
059: * A list of hash tables for accumulating nominal values during parsing.
060: */
061: private FastVector m_cumulativeStructure;
062:
063: /**
064: * Holds instances accumulated so far
065: */
066: private FastVector m_cumulativeInstances;
067:
068: /**
069: * default constructor
070: */
071: public CSVLoader() {
072: // No instances retrieved yet
073: setRetrieval(NONE);
074: }
075:
076: /**
077: * Get the file extension used for arff files
078: *
079: * @return the file extension
080: */
081: public String getFileExtension() {
082: return FILE_EXTENSION;
083: }
084:
085: /**
086: * Returns a description of the file type.
087: *
088: * @return a short file description
089: */
090: public String getFileDescription() {
091: return "CSV data files";
092: }
093:
094: /**
095: * Gets all the file extensions used for this type of file
096: *
097: * @return the file extensions
098: */
099: public String[] getFileExtensions() {
100: return new String[] { getFileExtension() };
101: }
102:
103: /**
104: * Returns a string describing this attribute evaluator
105: * @return a description of the evaluator suitable for
106: * displaying in the explorer/experimenter gui
107: */
108: public String globalInfo() {
109: return "Reads a source that is in comma separated or tab separated format. "
110: + "Assumes that the first row in the file determines the number of "
111: + "and names of the attributes.";
112: }
113:
114: /**
115: * Is ignored and doesn't throw an Exception.
116: *
117: * @param input the input stream - ignored
118: * @exception IOException always
119: */
120: public void setSource(InputStream input) throws IOException {
121: // ignored
122: }
123:
124: /**
125: * Determines and returns (if possible) the structure (internally the
126: * header) of the data set as an empty set of instances.
127: *
128: * @return the structure of the data set as an empty set of Instances
129: * @exception IOException if an error occurs
130: */
131: public Instances getStructure() throws IOException {
132: if (m_sourceFile == null) {
133: throw new IOException("No source has been specified");
134: }
135:
136: if (m_structure == null) {
137: try {
138: BufferedReader br = new BufferedReader(new FileReader(
139: m_sourceFile));
140:
141: // assumes that the first line of the file is the header
142: /*m_tokenizer = new StreamTokenizer(br);
143: initTokenizer(m_tokenizer);
144: readHeader(m_tokenizer); */
145: StreamTokenizer st = new StreamTokenizer(br);
146: initTokenizer(st);
147: readStructure(st);
148: } catch (FileNotFoundException ex) {
149: }
150: }
151:
152: return m_structure;
153: }
154:
155: /**
156: * reads the structure
157: *
158: * @param st the stream tokenizer to read from
159: * @throws IOException if reading fails
160: */
161: private void readStructure(StreamTokenizer st) throws IOException {
162: readHeader(st);
163: }
164:
165: /**
166: * Return the full data set. If the structure hasn't yet been determined
167: * by a call to getStructure then method should do so before processing
168: * the rest of the data set.
169: *
170: * @return the structure of the data set as an empty set of Instances
171: * @exception IOException if there is no source or parsing fails
172: */
173: public Instances getDataSet() throws IOException {
174: if (m_sourceFile == null) {
175: throw new IOException("No source has been specified");
176: }
177: // m_sourceReader.close();
178: setSource(m_sourceFile);
179: BufferedReader br = new BufferedReader(new FileReader(
180: m_sourceFile));
181: // getStructure();
182: StreamTokenizer st = new StreamTokenizer(br);
183: initTokenizer(st);
184: readStructure(st);
185:
186: st.ordinaryChar(',');
187: st.ordinaryChar('\t');
188:
189: m_cumulativeStructure = new FastVector(m_structure
190: .numAttributes());
191: for (int i = 0; i < m_structure.numAttributes(); i++) {
192: m_cumulativeStructure.addElement(new Hashtable());
193: }
194:
195: // Instances result = new Instances(m_structure);
196: m_cumulativeInstances = new FastVector();
197: FastVector current;
198: while ((current = getInstance(st)) != null) {
199: m_cumulativeInstances.addElement(current);
200: }
201: br.close();
202: // now determine the true structure of the data set
203: FastVector atts = new FastVector(m_structure.numAttributes());
204: for (int i = 0; i < m_structure.numAttributes(); i++) {
205: String attname = m_structure.attribute(i).name();
206: Hashtable tempHash = ((Hashtable) m_cumulativeStructure
207: .elementAt(i));
208: if (tempHash.size() == 0) {
209: atts.addElement(new Attribute(attname));
210: } else {
211: FastVector values = new FastVector(tempHash.size());
212: // add dummy objects in order to make the FastVector's size == capacity
213: for (int z = 0; z < tempHash.size(); z++) {
214: values.addElement("dummy");
215: }
216: Enumeration e = tempHash.keys();
217: while (e.hasMoreElements()) {
218: Object ob = e.nextElement();
219: // if (ob instanceof Double) {
220: int index = ((Integer) tempHash.get(ob)).intValue();
221: values.setElementAt(new String(ob.toString()),
222: index);
223: // }
224: }
225: atts.addElement(new Attribute(attname, values));
226: }
227: }
228:
229: // make the instances
230: String relationName = (m_sourceFile.getName()).replaceAll(
231: "\\.[cC][sS][vV]$", "");
232: Instances dataSet = new Instances(relationName, atts,
233: m_cumulativeInstances.size());
234:
235: for (int i = 0; i < m_cumulativeInstances.size(); i++) {
236: current = ((FastVector) m_cumulativeInstances.elementAt(i));
237: double[] vals = new double[dataSet.numAttributes()];
238: for (int j = 0; j < current.size(); j++) {
239: Object cval = current.elementAt(j);
240: if (cval instanceof String) {
241: if (((String) cval).compareTo("?") == 0) {
242: vals[j] = Instance.missingValue();
243: } else {
244: if (!dataSet.attribute(j).isNominal()) {
245: System.err
246: .println("Wrong attribute type!!!");
247: System.exit(1);
248: }
249: // find correct index
250: Hashtable lookup = (Hashtable) m_cumulativeStructure
251: .elementAt(j);
252: int index = ((Integer) lookup.get(cval))
253: .intValue();
254: vals[j] = (double) index;
255: }
256: } else if (dataSet.attribute(j).isNominal()) {
257: // find correct index
258: Hashtable lookup = (Hashtable) m_cumulativeStructure
259: .elementAt(j);
260: int index = ((Integer) lookup.get(cval)).intValue();
261: vals[j] = (double) index;
262: } else {
263: vals[j] = ((Double) cval).doubleValue();
264: }
265: }
266: dataSet.add(new Instance(1.0, vals));
267: }
268: m_structure = new Instances(dataSet, 0);
269: setRetrieval(BATCH);
270: m_cumulativeStructure = null; // conserve memory
271: return dataSet;
272: }
273:
274: /**
275: * CSVLoader is unable to process a data set incrementally.
276: *
277: * @param structure ignored
278: * @return never returns without throwing an exception
279: * @exception IOException always. CSVLoader is unable to process a data
280: * set incrementally.
281: */
282: public Instance getNextInstance(Instances structure)
283: throws IOException {
284: throw new IOException(
285: "CSVLoader can't read data sets incrementally.");
286: }
287:
288: /**
289: * Attempts to parse a line of the data set.
290: *
291: * @param tokenizer the tokenizer
292: * @return a FastVector containg String and Double objects representing
293: * the values of the instance.
294: * @exception IOException if an error occurs
295: *
296: * <pre><jml>
297: * private_normal_behavior
298: * requires: tokenizer != null;
299: * ensures: \result != null;
300: * also
301: * private_exceptional_behavior
302: * requires: tokenizer == null
303: * || (* unsucessful parse *);
304: * signals: (IOException);
305: * </jml></pre>
306: */
307: private FastVector getInstance(StreamTokenizer tokenizer)
308: throws IOException {
309:
310: FastVector current = new FastVector();
311:
312: // Check if end of file reached.
313: ConverterUtils.getFirstToken(tokenizer);
314: if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
315: return null;
316: }
317: boolean first = true;
318: boolean wasSep;
319:
320: while (tokenizer.ttype != StreamTokenizer.TT_EOL
321: && tokenizer.ttype != StreamTokenizer.TT_EOF) {
322:
323: // Get next token
324: if (!first) {
325: ConverterUtils.getToken(tokenizer);
326: }
327:
328: if (tokenizer.ttype == ',' || tokenizer.ttype == '\t'
329: || tokenizer.ttype == StreamTokenizer.TT_EOL) {
330: current.addElement("?");
331: wasSep = true;
332: } else {
333: wasSep = false;
334: /* // Check if token is valid.
335: if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
336: errms(tokenizer,"not a valid value");
337: }*/
338:
339: // try to parse as a number
340: try {
341: double val = Double.valueOf(tokenizer.sval)
342: .doubleValue();
343: current.addElement(new Double(val));
344: } catch (NumberFormatException e) {
345: // otherwise assume its an enumerated value
346: current.addElement(new String(tokenizer.sval
347: .replace(' ', '_')));
348: }
349: }
350:
351: if (!wasSep) {
352: ConverterUtils.getToken(tokenizer);
353: }
354: first = false;
355: }
356:
357: // check number of values read
358: if (current.size() != m_structure.numAttributes()) {
359: ConverterUtils.errms(tokenizer,
360: "wrong number of values. Read " + current.size()
361: + ", expected "
362: + m_structure.numAttributes());
363: }
364:
365: // check for structure update
366: try {
367: checkStructure(current);
368: } catch (Exception ex) {
369: ex.printStackTrace();
370: }
371:
372: return current;
373: }
374:
375: /**
376: * Checks the current instance against what is known about the structure
377: * of the data set so far. If there is a nominal value for an attribute
378: * that was beleived to be numeric then all previously seen values for this
379: * attribute are stored in a Hashtable.
380: *
381: * @param current a <code>FastVector</code> value
382: * @exception Exception if an error occurs
383: *
384: * <pre><jml>
385: * private_normal_behavior
386: * requires: current != null;
387: * also
388: * private_exceptional_behavior
389: * requires: current == null
390: * || (* unrecognized object type in current *);
391: * signals: (Exception);
392: * </jml></pre>
393: */
394: private void checkStructure(FastVector current) throws Exception {
395: if (current == null) {
396: throw new Exception(
397: "current shouldn't be null in checkStructure");
398: }
399: for (int i = 0; i < current.size(); i++) {
400: Object ob = current.elementAt(i);
401: if (ob instanceof String) {
402: if (((String) ob).compareTo("?") == 0) {
403: } else {
404: Hashtable tempHash = (Hashtable) m_cumulativeStructure
405: .elementAt(i);
406: if (!tempHash.containsKey(ob)) {
407: // may have found a nominal value in what was previously thought to
408: // be a numeric variable.
409: if (tempHash.size() == 0) {
410: for (int j = 0; j < m_cumulativeInstances
411: .size(); j++) {
412: FastVector tempUpdate = ((FastVector) m_cumulativeInstances
413: .elementAt(j));
414: Object tempO = tempUpdate.elementAt(i);
415: if (tempO instanceof String) {
416: // must have been a missing value
417: } else {
418: if (!tempHash.containsKey(tempO)) {
419: tempHash
420: .put(
421: new Double(
422: ((Double) tempO)
423: .doubleValue()),
424: new Integer(
425: tempHash
426: .size()));
427: }
428: }
429: }
430: }
431: int newIndex = tempHash.size();
432: tempHash.put(ob, new Integer(newIndex));
433: }
434: }
435: } else if (ob instanceof Double) {
436: Hashtable tempHash = (Hashtable) m_cumulativeStructure
437: .elementAt(i);
438: if (tempHash.size() != 0) {
439: if (!tempHash.containsKey(ob)) {
440: int newIndex = tempHash.size();
441: tempHash.put(new Double(((Double) ob)
442: .doubleValue()), new Integer(newIndex));
443: }
444: }
445: } else {
446: throw new Exception(
447: "Wrong object type in checkStructure!");
448: }
449: }
450: }
451:
452: /**
453: * Assumes the first line of the file contains the attribute names.
454: * Assumes all attributes are real (Reading the full data set with
455: * getDataSet will establish the true structure).
456: *
457: * @param tokenizer a <code>StreamTokenizer</code> value
458: * @exception IOException if an error occurs
459: *
460: * <pre><jml>
461: * private_normal_behavior
462: * requires: tokenizer != null;
463: * modifiable: m_structure;
464: * ensures: m_structure != null;
465: * also
466: * private_exceptional_behavior
467: * requires: tokenizer == null
468: * || (* unsucessful parse *);
469: * signals: (IOException);
470: * </jml></pre>
471: */
472: private void readHeader(StreamTokenizer tokenizer)
473: throws IOException {
474:
475: FastVector attribNames = new FastVector();
476: ConverterUtils.getFirstToken(tokenizer);
477: if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
478: ConverterUtils.errms(tokenizer, "premature end of file");
479: }
480:
481: while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
482: attribNames.addElement(new Attribute(tokenizer.sval));
483: ConverterUtils.getToken(tokenizer);
484: }
485: String relationName = (m_sourceFile.getName()).replaceAll(
486: "\\.[cC][sS][vV]$", "");
487: m_structure = new Instances(relationName, attribNames, 0);
488: }
489:
490: /**
491: * Initializes the stream tokenizer
492: *
493: * @param tokenizer the tokenizer to initialize
494: */
495: private void initTokenizer(StreamTokenizer tokenizer) {
496: tokenizer.resetSyntax();
497: tokenizer.whitespaceChars(0, (' ' - 1));
498: tokenizer.wordChars(' ', '\u00FF');
499: tokenizer.whitespaceChars(',', ',');
500: tokenizer.whitespaceChars('\t', '\t');
501: // tokenizer.ordinaryChar(',');
502: tokenizer.commentChar('%');
503: tokenizer.quoteChar('"');
504: tokenizer.quoteChar('\'');
505: // tokenizer.ordinaryChar('{');
506: // tokenizer.ordinaryChar('}');
507: tokenizer.eolIsSignificant(true);
508: }
509:
510: /**
511: * Main method.
512: *
513: * @param args should contain the name of an input file.
514: */
515: public static void main(String[] args) {
516: runFileLoader(new CSVLoader(), args);
517: }
518: }
|