001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * LibSVMLoader.java
019: * Copyright (C) 2006 University of Waikato, Hamilton, NZ
020: *
021: */
022:
023: package weka.core.converters;
024:
025: import weka.core.Attribute;
026: import weka.core.FastVector;
027: import weka.core.Instance;
028: import weka.core.Instances;
029:
030: import java.io.BufferedReader;
031: import java.io.File;
032: import java.io.IOException;
033: import java.io.InputStream;
034: import java.io.InputStreamReader;
035: import java.io.Reader;
036: import java.net.URL;
037: import java.util.StringTokenizer;
038: import java.util.Vector;
039:
040: /**
041: <!-- globalinfo-start -->
042: * Reads a source that is in libsvm format.<br/>
043: * <br/>
044: * For more information about libsvm see:<br/>
045: * <br/>
046: * http://www.csie.ntu.edu.tw/~cjlin/libsvm/
047: * <p/>
048: <!-- globalinfo-end -->
049: *
050: * @author FracPete (fracpete at waikato dot ac dot nz)
051: * @version $Revision: 1.3 $
052: * @see Loader
053: */
054: public class LibSVMLoader extends AbstractFileLoader implements
055: BatchConverter, URLSourcedLoader {
056:
057: /** for serialization */
058: private static final long serialVersionUID = 4988360125354664417L;
059:
060: /** the file extension */
061: public static String FILE_EXTENSION = ".libsvm";
062:
063: /** the url */
064: protected String m_URL = "http://";
065:
066: /** The reader for the source file. */
067: protected transient Reader m_sourceReader = null;
068:
069: /** the buffer of the rows read so far */
070: protected Vector m_Buffer = null;
071:
072: /**
073: * Returns a string describing this Loader
074: *
075: * @return a description of the Loader suitable for
076: * displaying in the explorer/experimenter gui
077: */
078: public String globalInfo() {
079: return "Reads a source that is in libsvm format.\n\n"
080: + "For more information about libsvm see:\n\n"
081: + "http://www.csie.ntu.edu.tw/~cjlin/libsvm/";
082: }
083:
084: /**
085: * Get the file extension used for libsvm files
086: *
087: * @return the file extension
088: */
089: public String getFileExtension() {
090: return FILE_EXTENSION;
091: }
092:
093: /**
094: * Gets all the file extensions used for this type of file
095: *
096: * @return the file extensions
097: */
098: public String[] getFileExtensions() {
099: return new String[] { getFileExtension() };
100: }
101:
102: /**
103: * Returns a description of the file type.
104: *
105: * @return a short file description
106: */
107: public String getFileDescription() {
108: return "libsvm data files";
109: }
110:
111: /**
112: * Resets the Loader ready to read a new data set
113: *
114: * @throws IOException if something goes wrong
115: */
116: public void reset() throws IOException {
117: m_structure = null;
118: m_Buffer = null;
119:
120: setRetrieval(NONE);
121:
122: if ((m_File != null) && (new File(m_File)).isFile()) {
123: setFile(new File(m_File));
124: } else if ((m_URL != null) && !m_URL.equals("http://")) {
125: setURL(m_URL);
126: }
127: }
128:
129: /**
130: * Resets the Loader object and sets the source of the data set to be
131: * the supplied url.
132: *
133: * @param url the source url.
134: * @throws IOException if an error occurs
135: */
136: public void setSource(URL url) throws IOException {
137: m_structure = null;
138: m_Buffer = null;
139:
140: setRetrieval(NONE);
141:
142: setSource(url.openStream());
143:
144: m_URL = url.toString();
145: }
146:
147: /**
148: * Set the url to load from
149: *
150: * @param url the url to load from
151: * @throws IOException if the url can't be set.
152: */
153: public void setURL(String url) throws IOException {
154: m_URL = url;
155: setSource(new URL(url));
156: }
157:
158: /**
159: * Return the current url
160: *
161: * @return the current url
162: */
163: public String retrieveURL() {
164: return m_URL;
165: }
166:
167: /**
168: * Resets the Loader object and sets the source of the data set to be
169: * the supplied InputStream.
170: *
171: * @param in the source InputStream.
172: * @throws IOException if initialization of reader fails.
173: */
174: public void setSource(InputStream in) throws IOException {
175: m_File = (new File(System.getProperty("user.dir")))
176: .getAbsolutePath();
177: m_URL = "http://";
178:
179: m_sourceReader = new BufferedReader(new InputStreamReader(in));
180: }
181:
182: /**
183: * turns a libsvm row into a double array with the class as the last
184: * entry
185: *
186: * @param row the row to turn into a double array
187: * @return the corresponding double array
188: */
189: protected double[] libsvmToArray(String row) {
190: double[] result;
191: StringTokenizer tok;
192: int index;
193: int max;
194: String col;
195: double value;
196:
197: // determine max index
198: max = 0;
199: tok = new StringTokenizer(row, " \t");
200: tok.nextToken(); // skip class
201: while (tok.hasMoreTokens()) {
202: col = tok.nextToken();
203: index = Integer
204: .parseInt(col.substring(0, col.indexOf(":")));
205: if (index > max)
206: max = index;
207: }
208:
209: // read values into array
210: tok = new StringTokenizer(row, " \t");
211: result = new double[max + 1];
212:
213: // 1. class
214: result[result.length - 1] = Double.parseDouble(tok.nextToken());
215:
216: // 2. attributes
217: while (tok.hasMoreTokens()) {
218: col = tok.nextToken();
219: index = Integer
220: .parseInt(col.substring(0, col.indexOf(":")));
221: value = Double.parseDouble(col
222: .substring(col.indexOf(":") + 1));
223: result[index - 1] = value;
224: }
225:
226: return result;
227: }
228:
229: /**
230: * determines the number of attributes, if the number of attributes in the
231: * given row is greater than the current amount then this number will be
232: * returned, otherwise the current number
233: *
234: * @param row row to determine the number of attributes from
235: * @param num the current number of attributes
236: * @return the new number of attributes
237: */
238: protected int determineNumAttributes(String row, int num) {
239: int result;
240: int count;
241:
242: result = num;
243:
244: count = libsvmToArray(row).length;
245: if (count > result)
246: result = count;
247:
248: return result;
249: }
250:
251: /**
252: * Determines and returns (if possible) the structure (internally the
253: * header) of the data set as an empty set of instances.
254: *
255: * @return the structure of the data set as an empty set
256: * of Instances
257: * @throws IOException if an error occurs
258: */
259: public Instances getStructure() throws IOException {
260: StringBuffer line;
261: int cInt;
262: char c;
263: int numAtt;
264: FastVector atts;
265: int i;
266: String relName;
267:
268: if (m_sourceReader == null)
269: throw new IOException("No source has been specified");
270:
271: if (m_structure == null) {
272: m_Buffer = new Vector();
273: try {
274: // determine number of attributes
275: numAtt = 0;
276: line = new StringBuffer();
277: while ((cInt = m_sourceReader.read()) != -1) {
278: c = (char) cInt;
279: if ((c == '\n') || (c == '\r')) {
280: if (line.length() > 0) {
281: m_Buffer
282: .add(libsvmToArray(line.toString()));
283: numAtt = determineNumAttributes(line
284: .toString(), numAtt);
285: }
286: line = new StringBuffer();
287: } else {
288: line.append(c);
289: }
290: }
291:
292: // last line?
293: if (line.length() != 0) {
294: m_Buffer.add(libsvmToArray(line.toString()));
295: numAtt = determineNumAttributes(line.toString(),
296: numAtt);
297: }
298:
299: // generate header
300: atts = new FastVector(numAtt);
301: for (i = 0; i < numAtt - 1; i++)
302: atts.addElement(new Attribute("att_" + (i + 1)));
303: atts.addElement(new Attribute("class"));
304:
305: if (!m_URL.equals("http://"))
306: relName = m_URL;
307: else
308: relName = m_File;
309:
310: m_structure = new Instances(relName, atts, 0);
311: m_structure
312: .setClassIndex(m_structure.numAttributes() - 1);
313: } catch (Exception ex) {
314: throw new IOException(
315: "Unable to determine structure as libsvm.");
316: }
317: }
318:
319: return new Instances(m_structure, 0);
320: }
321:
322: /**
323: * Return the full data set. If the structure hasn't yet been determined
324: * by a call to getStructure then method should do so before processing
325: * the rest of the data set.
326: *
327: * @return the structure of the data set as an empty
328: * set of Instances
329: * @throws IOException if there is no source or parsing fails
330: */
331: public Instances getDataSet() throws IOException {
332: Instances result;
333: double[] sparse;
334: double[] data;
335: int i;
336:
337: if (m_sourceReader == null)
338: throw new IOException("No source has been specified");
339:
340: if (getRetrieval() == INCREMENTAL)
341: throw new IOException(
342: "Cannot mix getting Instances in both incremental and batch modes");
343:
344: setRetrieval(BATCH);
345: if (m_structure == null)
346: getStructure();
347:
348: result = new Instances(m_structure, 0);
349:
350: // create instances from buffered arrays
351: for (i = 0; i < m_Buffer.size(); i++) {
352: sparse = (double[]) m_Buffer.get(i);
353:
354: if (sparse.length != m_structure.numAttributes()) {
355: data = new double[m_structure.numAttributes()];
356: // attributes
357: System.arraycopy(sparse, 0, data, 0, sparse.length - 1);
358: // class
359: data[data.length - 1] = sparse[sparse.length - 1];
360: } else {
361: data = sparse;
362: }
363:
364: result.add(new Instance(1, data));
365: }
366:
367: return result;
368: }
369:
370: /**
371: * LibSVmLoader is unable to process a data set incrementally.
372: *
373: * @param structure ignored
374: * @return never returns without throwing an exception
375: * @throws IOException always. LibSVMLoader is unable to process a
376: * data set incrementally.
377: */
378: public Instance getNextInstance(Instances structure)
379: throws IOException {
380: throw new IOException(
381: "LibSVMLoader can't read data sets incrementally.");
382: }
383:
384: /**
385: * Main method.
386: *
387: * @param args should contain the name of an input file.
388: */
389: public static void main(String[] args) {
390: runFileLoader(new LibSVMLoader(), args);
391: }
392: }
|