001: /*
002: * Read files in Excel comma separated value format.
003: * Copyright (C) 2001-2004 Stephen Ostermiller
004: * http://ostermiller.org/contact.pl?regarding=Java+Utilities
005: *
006: * This program is free software; you can redistribute it and/or modify
007: * it under the terms of the GNU General Public License as published by
008: * the Free Software Foundation; either version 2 of the License, or
009: * (at your option) any later version.
010: *
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: * GNU General Public License for more details.
015: *
016: * See COPYING.TXT for details.
017: */
018:
019: package com.Ostermiller.util;
020:
021: import java.io.*;
022: import java.util.Vector;
023:
024: /**
025: * Read files in comma separated value format as outputted by the Microsoft
026: * Excel Spreadsheet program.
027: * More information about this class is available from <a target="_top" href=
028: * "http://ostermiller.org/utils/ExcelCSV.html">ostermiller.org</a>.
029: * <P>
030: * Excel CSV is a file format used as a portable representation of a database.
031: * Each line is one entry or record and the fields in a record are separated by commas.
032: * <P>
033: * If field includes a comma or a new line, the whole field must be surrounded with double quotes.
034: * When the field is in quotes, any quote literals must be escaped by two quotes ("").
035: * Text that comes after quotes that have been closed but come before the next comma will be ignored.
036: * <P>
037: * Empty fields are returned as as String of length zero: "". The following line has three empty
038: * fields and three non-empty fields in it. There is an empty field on each end, and one in the
039: * middle. One token is returned as a space.<br>
040: * <pre>,second,, ,fifth,</pre>
041: * <P>
042: * Blank lines are always ignored. Other lines will be ignored if they start with a
043: * comment character as set by the setCommentStart() method.
044: * <P>
045: * An example of how CVSLexer might be used:
046: * <pre>
047: * ExcelCSVParser shredder = new ExcelCSVParser(System.in);
048: * String t;
049: * while ((t = shredder.nextValue()) != null){
050: * System.out.println("" + shredder.lastLineNumber() + " " + t);
051: * }
052: * </pre>
053: * <P>
054: * The CSV that Excel outputs differs from the
055: * <a href="http://ostermiller.org/utils/CSVLexer.html">standard</a>
056: * in several respects:
057: * <ul><li>Leading and trailing whitespace is significant.</li>
058: * <li>A backslash is not a special character and is not used to escape anything.</li>
059: * <li>Quotes inside quoted strings are escaped with a double quote rather than a backslash.</li>
060: * <li>Excel may convert data before putting it in CSV format:<ul>
061: * <li>Tabs are converted to a single space.</li>
062: * <li>New lines in the data are always represented as the UNIX new line. ("\n")</li>
063: * <li>Numbers that are greater than 12 digits may be represented in truncated
064: * scientific notation form.</li></ul>
065: * This parser does not attempt to fix these excel conversions, but users should be aware
066: * of them.</li></ul>
067: *
068: * @see com.Ostermiller.util.CSVParser
069: *
070: * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
071: * @since ostermillerutils 1.00.00
072: */
073: public class ExcelCSVParser implements CSVParse {
074:
075: /**
076: * InputStream on which this parser is based.
077: *
078: * @since ostermillerutils 1.02.22
079: */
080: private InputStream inStream;
081:
082: /**
083: * Reader on which this parser is based.
084: *
085: * @since ostermillerutils 1.02.22
086: */
087: private Reader inReader;
088:
089: /**
090: * Does all the dirty work.
091: * Calls for new tokens are routed through
092: * this object.
093: *
094: * @since ostermillerutils 1.00.00
095: */
096: private ExcelCSVLexer lexer;
097:
098: /**
099: * Token cache. Used for when we request a token
100: * from the lexer but can't return it because its
101: * on the next line.
102: *
103: * @since ostermillerutils 1.00.00
104: */
105: private String tokenCache;
106:
107: /**
108: * Line cache. The line number that goes along with
109: * the tokenCache. Not valid if the tokenCache is
110: * null.
111: *
112: * @since ostermillerutils 1.00.00
113: */
114: private int lineCache;
115:
116: /**
117: * The line number the last token came from, or -1 if
118: * no tokens have been returned.
119: *
120: * @since ostermillerutils 1.00.00
121: */
122: private int lastLine = -1;
123:
124: /**
125: * Create a parser to parse delimited values from
126: * an InputStream.
127: *
128: * @param in stream that contains comma separated values.
129: * @param delimiter record separator
130: *
131: * @throws BadDelimiterException if the specified delimiter cannot be used
132: *
133: * @since ostermillerutils 1.02.24
134: */
135: public ExcelCSVParser(InputStream in, char delimiter)
136: throws BadDelimiterException {
137: inStream = in;
138: lexer = new ExcelCSVLexer(in);
139: changeDelimiter(delimiter);
140: }
141:
142: /**
143: * Create a parser to parse comma separated values from
144: * an InputStream.
145: *
146: * @param in stream that contains comma separated values.
147: *
148: * @since ostermillerutils 1.00.00
149: */
150: public ExcelCSVParser(InputStream in) {
151: inStream = in;
152: lexer = new ExcelCSVLexer(in);
153: }
154:
155: /**
156: * Create a parser to parse delimited values from
157: * a Reader.
158: *
159: * @param in reader that contains comma separated values.
160: * @param delimiter record separator
161: *
162: * @throws BadDelimiterException if the specified delimiter cannot be used
163: *
164: * @since ostermillerutils 1.02.24
165: */
166: public ExcelCSVParser(Reader in, char delimiter)
167: throws BadDelimiterException {
168: inReader = in;
169: lexer = new ExcelCSVLexer(in);
170: changeDelimiter(delimiter);
171: }
172:
173: /**
174: * Create a parser to parse comma separated values from
175: * a Reader.
176: *
177: * @param in reader that contains comma separated values.
178: *
179: * @since ostermillerutils 1.00.00
180: */
181: public ExcelCSVParser(Reader in) {
182: inReader = in;
183: lexer = new ExcelCSVLexer(in);
184: }
185:
186: /**
187: * Close any stream upon which this parser is based.
188: *
189: * @since ostermillerutils 1.02.22
190: * @throws IOException if an error occurs while closing the stream.
191: */
192: public void close() throws IOException {
193: if (inStream != null)
194: inStream.close();
195: if (inReader != null)
196: inReader.close();
197: }
198:
199: /**
200: * get the next value.
201: *
202: * @return the next value or null if there are no more values.
203: * @throws IOException if an error occurs while reading.
204: *
205: * @since ostermillerutils 1.00.00
206: */
207: public String nextValue() throws IOException {
208: if (tokenCache == null) {
209: tokenCache = lexer.getNextToken();
210: lineCache = lexer.getLineNumber();
211: }
212: lastLine = lineCache;
213: String result = tokenCache;
214: tokenCache = null;
215: return result;
216: }
217:
218: /**
219: * Get the line number that the last token came from.
220: * <p>
221: * New line breaks that occur in the middle of a token are no
222: * counted in the line number count.
223: *
224: * @return line number or -1 if no tokens have been returned yet.
225: *
226: * @since ostermillerutils 1.00.00
227: */
228: public int lastLineNumber() {
229: return lastLine;
230: }
231:
232: /**
233: * Get all the values from a line.
234: * <p>
235: * If the line has already been partially read, only the
236: * values that have not already been read will be included.
237: *
238: * @return all the values from the line or null if there are no more values.
239: * @throws IOException if an error occurs while reading.
240: *
241: * @since ostermillerutils 1.00.00
242: */
243: public String[] getLine() throws IOException {
244: int lineNumber = -1;
245: Vector<String> v = new Vector<String>();
246: if (tokenCache != null) {
247: v.add(tokenCache);
248: lineNumber = lineCache;
249: }
250: while ((tokenCache = lexer.getNextToken()) != null
251: && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)) {
252: v.add(tokenCache);
253: lineNumber = lexer.getLineNumber();
254: }
255: if (v.size() == 0) {
256: return null;
257: }
258: lastLine = lineNumber;
259: lineCache = lexer.getLineNumber();
260: String[] result = new String[v.size()];
261: return (v.toArray(result));
262: }
263:
264: /**
265: * Get all the values from the file.
266: * <p>
267: * If the file has already been partially read, only the
268: * values that have not already been read will be included.
269: * <p>
270: * Each line of the file that has at least one value will be
271: * represented. Comments and empty lines are ignored.
272: * <p>
273: * The resulting double array may be jagged.
274: *
275: * @return all the values from the file or null if there are no more values.
276: * @throws IOException if an error occurs while reading.
277: *
278: * @since ostermillerutils 1.00.00
279: */
280: public String[][] getAllValues() throws IOException {
281: Vector<String[]> v = new Vector<String[]>();
282: String[] line;
283: while ((line = getLine()) != null) {
284: v.add(line);
285: }
286: if (v.size() == 0) {
287: return null;
288: }
289: String[][] result = new String[v.size()][];
290: return (v.toArray(result));
291: }
292:
293: /**
294: * Change this parser so that it uses a new delimiter.
295: * <p>
296: * The initial character is a comma, the delimiter cannot be changed
297: * to a quote or other character that has special meaning in CSV.
298: *
299: * @param newDelim delimiter to which to switch.
300: * @throws BadDelimiterException if the character cannot be used as a delimiter.
301: *
302: * @since ostermillerutils 1.02.08
303: */
304: public void changeDelimiter(char newDelim)
305: throws BadDelimiterException {
306: lexer.changeDelimiter(newDelim);
307: }
308:
309: /**
310: * Change this parser so that it uses a new character for quoting.
311: * <p>
312: * The initial character is a double quote ("), the delimiter cannot be changed
313: * to a comma or other character that has special meaning in CSV.
314: *
315: * @param newQuote character to use for quoting.
316: * @throws BadQuoteException if the character cannot be used as a quote.
317: *
318: * @since ostermillerutils 1.02.16
319: */
320: public void changeQuote(char newQuote) throws BadQuoteException {
321: lexer.changeQuote(newQuote);
322: }
323:
324: /**
325: * Set the characters that indicate a comment at the beginning of the line.
326: * For example if the string "#;!" were passed in, all of the following lines
327: * would be comments:<br>
328: * <pre> # Comment
329: * ; Another Comment
330: * ! Yet another comment</pre>
331: * By default there are no comments in CVS files. Commas and quotes may not be
332: * used to indicate comment lines.
333: *
334: * @param commentDelims list of characters a comment line may start with.
335: *
336: * @since ostermillerutils 1.00.00
337: */
338: public void setCommentStart(String commentDelims) {
339: lexer.setCommentStart(commentDelims);
340: }
341:
342: /**
343: * Get the number of the line from which the last value was retrieved.
344: *
345: * @return line number or -1 if no tokens have been returned.
346: *
347: * @since ostermillerutils 1.00.00
348: */
349: public int getLastLineNumber() {
350: return lastLine;
351: }
352:
353: /**
354: * Parse the comma delimited data from a string.
355: *
356: * @param s string with comma delimited data to parse.
357: * @return parsed data.
358: *
359: * @since ostermillerutils 1.02.03
360: */
361: public static String[][] parse(String s) {
362: try {
363: return (new ExcelCSVParser(new StringReader(s)))
364: .getAllValues();
365: } catch (IOException x) {
366: return null;
367: }
368: }
369:
370: /**
371: * Parse the delimited data from a string.
372: *
373: * @param s string with delimited data to parse.
374: * @param delimiter record separator
375: * @return parsed data.
376: * @throws BadDelimiterException if the character cannot be used as a delimiter.
377: *
378: * @since ostermillerutils 1.02.24
379: */
380: public static String[][] parse(String s, char delimiter)
381: throws BadDelimiterException {
382: try {
383: return (new ExcelCSVParser(new StringReader(s), delimiter))
384: .getAllValues();
385: } catch (IOException x) {
386: return null;
387: }
388: }
389:
390: /**
391: * Parse the comma delimited data from a stream.
392: *
393: * @param in Reader with comma delimited data to parse.
394: * @return parsed data.
395: * @throws IOException if an error occurs while reading.
396: *
397: * @since ostermillerutils 1.02.03
398: */
399: public static String[][] parse(Reader in) throws IOException {
400: return (new ExcelCSVParser(in)).getAllValues();
401: }
402:
403: /**
404: * Parse the delimited data from a stream.
405: *
406: * @param in Reader with delimited data to parse.
407: * @param delimiter record separator
408: * @return parsed data.
409: * @throws BadDelimiterException if the character cannot be used as a delimiter.
410: * @throws IOException if an error occurs while reading.
411: *
412: * @since ostermillerutils 1.02.24
413: */
414: public static String[][] parse(Reader in, char delimiter)
415: throws IOException, BadDelimiterException {
416: return (new ExcelCSVParser(in, delimiter)).getAllValues();
417: }
418: }
|