001: /*
002: * Read files in comma separated value format.
003: * Copyright (C) 2001-2004 Stephen Ostermiller
004: * http://ostermiller.org/contact.pl?regarding=Java+Utilities
005: *
006: * This program is free software; you can redistribute it and/or modify
007: * it under the terms of the GNU General Public License as published by
008: * the Free Software Foundation; either version 2 of the License, or
009: * (at your option) any later version.
010: *
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: * GNU General Public License for more details.
015: *
016: * See COPYING.TXT for details.
017: */
018:
019: package com.Ostermiller.util;
020:
021: import java.io.*;
022: import java.util.*;
023:
024: /**
025: * Read files in comma separated value format.
026: * More information about this class is available from <a target="_top" href=
027: * "http://ostermiller.org/utils/CSVLexer.html">ostermiller.org</a>.
028: *
029: * CSV is a file format used as a portable representation of a database.
030: * Each line is one entry or record and the fields in a record are separated by commas.
031: * Commas may be preceded or followed by arbitrary space and/or tab characters which are
032: * ignored.
033: * <P>
034: * If field includes a comma or a new line, the whole field must be surrounded with double quotes.
035: * When the field is in quotes, any quote literals must be escaped by \" Backslash
036: * literals must be escaped by \\. Otherwise a backslash and the character following
037: * will be treated as the following character, IE. "\n" is equivalent to "n". Other escape
038: * sequences may be set using the setEscapes() method. Text that comes after quotes that have
039: * been closed but come before the next comma will be ignored.
040: * <P>
041: * Empty fields are returned as as String of length zero: "". The following line has three empty
042: * fields and three non-empty fields in it. There is an empty field on each end, and one in the
043: * middle. One token is returned as a space.<br>
044: * <pre>,second,," ",fifth,</pre>
045: * <P>
046: * Blank lines are always ignored. Other lines will be ignored if they start with a
047: * comment character as set by the setCommentStart() method.
048: * <P>
049: * An example of how CVSLexer might be used:
050: * <pre>
051: * CSVParser shredder = new CSVParser(System.in);
052: * shredder.setCommentStart("#;!");
053: * shredder.setEscapes("nrtf", "\n\r\t\f");
054: * String t;
055: * while ((t = shredder.nextValue()) != null){
056: * System.out.println("" + shredder.lastLineNumber() + " " + t);
057: * }
058: * </pre>
059: * <P>
060: * Some applications do not output CSV according to the generally accepted standards and this parse may
061: * not be able to handle it. One such application is the Microsoft Excel spreadsheet. A
062: * separate class must be use to read
063: * <a href="http://ostermiller.org/utils/ExcelCSV.html">Excel CSV</a>.
064: *
065: * @see com.Ostermiller.util.ExcelCSVParser
066: *
067: * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
068: * @since ostermillerutils 1.00.00
069: */
070: public class CSVParser implements CSVParse {
071:
072: /**
073: * InputStream on which this parser is based.
074: *
075: * @since ostermillerutils 1.02.22
076: */
077: private InputStream inStream;
078:
079: /**
080: * Reader on which this parser is based.
081: *
082: * @since ostermillerutils 1.02.22
083: */
084: private Reader inReader;
085:
086: /**
087: * Does all the dirty work.
088: * Calls for new tokens are routed through
089: * this object.
090: *
091: * @since ostermillerutils 1.00.00
092: */
093: private CSVLexer lexer;
094:
095: /**
096: * Token cache. Used for when we request a token
097: * from the lexer but can't return it because its
098: * on the next line.
099: *
100: * @since ostermillerutils 1.00.00
101: */
102: private String tokenCache;
103:
104: /**
105: * Line cache. The line number that goes along with
106: * the tokenCache. Not valid if the tokenCache is
107: * null.
108: *
109: * @since ostermillerutils 1.00.00
110: */
111: private int lineCache;
112:
113: /**
114: * The line number the last token came from, or -1 if
115: * no tokens have been returned.
116: *
117: * @since ostermillerutils 1.00.00
118: */
119: private int lastLine = -1;
120:
121: /**
122: * Create a parser to parse comma separated values from
123: * an InputStream.
124: * <p>
125: * Byte to character conversion is done using the platform
126: * default locale.
127: *
128: * @param in stream that contains comma separated values.
129: *
130: * @since ostermillerutils 1.00.00
131: */
132: public CSVParser(InputStream in) {
133: inStream = in;
134: lexer = new CSVLexer(in);
135: }
136:
137: /**
138: * Create a parser to parse delimited values from
139: * an InputStream.
140: * <p>
141: * Byte to character conversion is done using the platform
142: * default locale.
143: *
144: * @param in stream that contains comma separated values.
145: * @param delimiter record separator
146: *
147: * @throws BadDelimiterException if the specified delimiter cannot be used
148: *
149: * @since ostermillerutils 1.02.24
150: */
151: public CSVParser(InputStream in, char delimiter)
152: throws BadDelimiterException {
153: inStream = in;
154: lexer = new CSVLexer(in);
155: changeDelimiter(delimiter);
156: }
157:
158: /**
159: * Create a parser to parse comma separated values from
160: * a Reader.
161: *
162: * @param in reader that contains comma separated values.
163: *
164: * @since ostermillerutils 1.00.00
165: */
166: public CSVParser(Reader in) {
167: inReader = in;
168: lexer = new CSVLexer(in);
169: }
170:
171: /**
172: * Create a parser to parse delimited values from
173: * a Reader.
174: *
175: * @param in reader that contains comma separated values.
176: * @param delimiter record separator
177: *
178: * @throws BadDelimiterException if the specified delimiter cannot be used
179: *
180: * @since ostermillerutils 1.02.24
181: */
182: public CSVParser(Reader in, char delimiter)
183: throws BadDelimiterException {
184: inReader = in;
185: lexer = new CSVLexer(in);
186: changeDelimiter(delimiter);
187: }
188:
189: /**
190: * Create a parser to parse delimited values from
191: * an InputStream.
192: * <p>
193: * Byte to character conversion is done using the platform
194: * default locale.
195: *
196: * @param in stream that contains comma separated values.
197: * @param escapes a list of characters that will represent escape sequences.
198: * @param replacements the list of replacement characters for those escape sequences.
199: * @param commentDelims list of characters a comment line may start with.
200: * @param delimiter record separator
201: *
202: * @throws BadDelimiterException if the specified delimiter cannot be used
203: *
204: * @since ostermillerutils 1.02.24
205: */
206: public CSVParser(InputStream in, char delimiter, String escapes,
207: String replacements, String commentDelims)
208: throws BadDelimiterException {
209: inStream = in;
210: lexer = new CSVLexer(in);
211: setEscapes(escapes, replacements);
212: setCommentStart(commentDelims);
213: changeDelimiter(delimiter);
214: }
215:
216: /**
217: * Create a parser to parse comma separated values from
218: * an InputStream.
219: * <p>
220: * Byte to character conversion is done using the platform
221: * default locale.
222: *
223: * @param in stream that contains comma separated values.
224: * @param escapes a list of characters that will represent escape sequences.
225: * @param replacements the list of replacement characters for those escape sequences.
226: * @param commentDelims list of characters a comment line may start with.
227: *
228: * @since ostermillerutils 1.00.00
229: */
230: public CSVParser(InputStream in, String escapes,
231: String replacements, String commentDelims) {
232: inStream = in;
233: lexer = new CSVLexer(in);
234: setEscapes(escapes, replacements);
235: setCommentStart(commentDelims);
236: }
237:
238: /**
239: * Create a parser to parse delimited values from
240: * a Reader.
241: *
242: * @param in reader that contains comma separated values.
243: * @param escapes a list of characters that will represent escape sequences.
244: * @param replacements the list of replacement characters for those escape sequences.
245: * @param commentDelims list of characters a comment line may start with.
246: * @param delimiter record separator
247: *
248: * @throws BadDelimiterException if the specified delimiter cannot be used
249: *
250: * @since ostermillerutils 1.02.24
251: */
252: public CSVParser(Reader in, char delimiter, String escapes,
253: String replacements, String commentDelims)
254: throws BadDelimiterException {
255: inReader = in;
256: lexer = new CSVLexer(in);
257: setEscapes(escapes, replacements);
258: setCommentStart(commentDelims);
259: changeDelimiter(delimiter);
260: }
261:
262: /**
263: * Create a parser to parse comma separated values from
264: * a Reader.
265: *
266: * @param in reader that contains comma separated values.
267: * @param escapes a list of characters that will represent escape sequences.
268: * @param replacements the list of replacement characters for those escape sequences.
269: * @param commentDelims list of characters a comment line may start with.
270: *
271: * @since ostermillerutils 1.00.00
272: */
273: public CSVParser(Reader in, String escapes, String replacements,
274: String commentDelims) {
275: inReader = in;
276: lexer = new CSVLexer(in);
277: setEscapes(escapes, replacements);
278: setCommentStart(commentDelims);
279: }
280:
281: /**
282: * Close any stream upon which this parser is based.
283: *
284: * @since ostermillerutils 1.02.22
285: * @throws IOException if an error occurs while closing the stream.
286: */
287: public void close() throws IOException {
288: if (inStream != null)
289: inStream.close();
290: if (inReader != null)
291: inReader.close();
292: }
293:
294: /**
295: * get the next value.
296: *
297: * @return the next value or null if there are no more values.
298: * @throws IOException if an error occurs while reading.
299: *
300: * @since ostermillerutils 1.00.00
301: */
302: public String nextValue() throws IOException {
303: if (tokenCache == null) {
304: tokenCache = lexer.getNextToken();
305: lineCache = lexer.getLineNumber();
306: }
307: lastLine = lineCache;
308: String result = tokenCache;
309: tokenCache = null;
310: return result;
311: }
312:
313: /**
314: * Get the line number that the last token came from.
315: * <p>
316: * New line breaks that occur in the middle of a token are no
317: * counted in the line number count.
318: *
319: * @return line number or -1 if no tokens have been returned yet.
320: *
321: * @since ostermillerutils 1.00.00
322: */
323: public int lastLineNumber() {
324: return lastLine;
325: }
326:
327: /**
328: * Get all the values from a line.
329: * <p>
330: * If the line has already been partially read, only the
331: * values that have not already been read will be included.
332: *
333: * @return all the values from the line or null if there are no more values.
334: * @throws IOException if an error occurs while reading.
335: *
336: * @since ostermillerutils 1.00.00
337: */
338: public String[] getLine() throws IOException {
339: int lineNumber = -1;
340: ArrayList<String> v = new ArrayList<String>();
341: if (tokenCache != null) {
342: v.add(tokenCache);
343: lineNumber = lineCache;
344: }
345: while ((tokenCache = lexer.getNextToken()) != null
346: && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)) {
347: v.add(tokenCache);
348: lineNumber = lexer.getLineNumber();
349: }
350: if (v.size() == 0) {
351: return null;
352: }
353: lastLine = lineNumber;
354: lineCache = lexer.getLineNumber();
355: String[] result = new String[v.size()];
356: return v.toArray(result);
357: }
358:
359: /**
360: * Get all the values from the file.
361: * <p>
362: * If the file has already been partially read, only the
363: * values that have not already been read will be included.
364: * <p>
365: * Each line of the file that has at least one value will be
366: * represented. Comments and empty lines are ignored.
367: * <p>
368: * The resulting double array may be jagged.
369: *
370: * @return all the values from the file or null if there are no more values.
371: * @throws IOException if an error occurs while reading.
372: *
373: * @since ostermillerutils 1.00.00
374: */
375: public String[][] getAllValues() throws IOException {
376: ArrayList<String[]> v = new ArrayList<String[]>();
377: String[] line;
378: while ((line = getLine()) != null) {
379: v.add(line);
380: }
381: if (v.size() == 0) {
382: return null;
383: }
384: String[][] result = new String[v.size()][];
385: return v.toArray(result);
386: }
387:
388: /**
389: * Specify escape sequences and their replacements.
390: * Escape sequences set here are in addition to \\ and \".
391: * \\ and \" are always valid escape sequences. This method
392: * allows standard escape sequenced to be used. For example
393: * "\n" can be set to be a newline rather than an 'n'.
394: * A common way to call this method might be:<br>
395: * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
396: * which would set the escape sequences to be the Java escape
397: * sequences. Characters that follow a \ that are not escape
398: * sequences will still be interpreted as that character.<br>
399: * The two arguments to this method must be the same length. If
400: * they are not, the longer of the two will be truncated.
401: *
402: * @param escapes a list of characters that will represent escape sequences.
403: * @param replacements the list of replacement characters for those escape sequences.
404: *
405: * @since ostermillerutils 1.00.00
406: */
407: public void setEscapes(String escapes, String replacements) {
408: lexer.setEscapes(escapes, replacements);
409: }
410:
411: /**
412: * Change this parser so that it uses a new delimiter.
413: * <p>
414: * The initial character is a comma, the delimiter cannot be changed
415: * to a quote or other character that has special meaning in CSV.
416: *
417: * @param newDelim delimiter to which to switch.
418: * @throws BadDelimiterException if the character cannot be used as a delimiter.
419: *
420: * @since ostermillerutils 1.02.08
421: */
422: public void changeDelimiter(char newDelim)
423: throws BadDelimiterException {
424: lexer.changeDelimiter(newDelim);
425: }
426:
427: /**
428: * Change this parser so that it uses a new character for quoting.
429: * <p>
430: * The initial character is a double quote ("), the delimiter cannot be changed
431: * to a comma or other character that has special meaning in CSV.
432: *
433: * @param newQuote character to use for quoting.
434: * @throws BadQuoteException if the character cannot be used as a quote.
435: *
436: * @since ostermillerutils 1.02.16
437: */
438: public void changeQuote(char newQuote) throws BadQuoteException {
439: lexer.changeQuote(newQuote);
440: }
441:
442: /**
443: * Set the characters that indicate a comment at the beginning of the line.
444: * For example if the string "#;!" were passed in, all of the following lines
445: * would be comments:<br>
446: * <pre> # Comment
447: * ; Another Comment
448: * ! Yet another comment</pre>
449: * By default there are no comments in CVS files. Commas and quotes may not be
450: * used to indicate comment lines.
451: *
452: * @param commentDelims list of characters a comment line may start with.
453: *
454: * @since ostermillerutils 1.00.00
455: */
456: public void setCommentStart(String commentDelims) {
457: lexer.setCommentStart(commentDelims);
458: }
459:
460: /**
461: * Get the number of the line from which the last value was retrieved.
462: *
463: * @return line number or -1 if no tokens have been returned.
464: *
465: * @since ostermillerutils 1.00.00
466: */
467: public int getLastLineNumber() {
468: return lastLine;
469: }
470:
471: /**
472: * Parse the comma delimited data from a string.
473: * <p>
474: * Only escaped backslashes and quotes will be recognized as escape sequences.
475: * The data will be treated as having no comments.
476: *
477: * @param s string with comma delimited data to parse.
478: * @return parsed data.
479: *
480: * @since ostermillerutils 1.02.03
481: */
482: public static String[][] parse(String s) {
483: try {
484: return (new CSVParser(new StringReader(s))).getAllValues();
485: } catch (IOException x) {
486: return null;
487: }
488: }
489:
490: /**
491: * Parse the delimited data from a string.
492: * <p>
493: * Only escaped backslashes and quotes will be recognized as escape sequences.
494: * The data will be treated as having no comments.
495: *
496: * @param s string with delimited data to parse.
497: * @param delimiter record separator
498: * @return parsed data.
499: * @throws BadDelimiterException if the character cannot be used as a delimiter.
500: *
501: * @since ostermillerutils 1.02.24
502: */
503: public static String[][] parse(String s, char delimiter)
504: throws BadDelimiterException {
505: try {
506: return (new CSVParser(new StringReader(s), delimiter))
507: .getAllValues();
508: } catch (IOException x) {
509: return null;
510: }
511: }
512:
513: /**
514: * Parse the comma delimited data from a string.
515: * Escaped backslashes and quotes will always recognized as escape sequences.
516: *
517: * @param s string with comma delimited data to parse.
518: * @param escapes a list of additional characters that will represent escape sequences.
519: * @param replacements the list of replacement characters for those escape sequences.
520: * @param commentDelims list of characters a comment line may start with.
521: * @return parsed data.
522: *
523: * @since ostermillerutils 1.02.03
524: */
525: public static String[][] parse(String s, String escapes,
526: String replacements, String commentDelims) {
527: try {
528: return (new CSVParser(new StringReader(s), escapes,
529: replacements, commentDelims)).getAllValues();
530: } catch (IOException x) {
531: return null;
532: }
533: }
534:
535: /**
536: * Parse the delimited data from a string.
537: * Escaped backslashes and quotes will always recognized as escape sequences.
538: *
539: * @param s string with delimited data to parse.
540: * @param escapes a list of additional characters that will represent escape sequences.
541: * @param replacements the list of replacement characters for those escape sequences.
542: * @param commentDelims list of characters a comment line may start with.
543: * @param delimiter record separator
544: * @return parsed data.
545: * @throws BadDelimiterException if the character cannot be used as a delimiter.
546: *
547: * @since ostermillerutils 1.02.24
548: */
549: public static String[][] parse(String s, char delimiter,
550: String escapes, String replacements, String commentDelims)
551: throws BadDelimiterException {
552: try {
553: return (new CSVParser(new StringReader(s), delimiter,
554: escapes, replacements, commentDelims))
555: .getAllValues();
556: } catch (IOException x) {
557: return null;
558: }
559: }
560:
561: /**
562: * Parse the comma delimited data from a stream.
563: * <p>
564: * Only escaped backslashes and quotes will be recognized as escape sequences.
565: * The data will be treated as having no comments.
566: *
567: * @param in Reader with comma delimited data to parse.
568: * @param delimiter record separator
569: * @return parsed data.
570: * @throws BadDelimiterException if the character cannot be used as a delimiter.
571: * @throws IOException if an error occurs while reading.
572: *
573: * @since ostermillerutils 1.02.24
574: */
575: public static String[][] parse(Reader in, char delimiter)
576: throws IOException, BadDelimiterException {
577: return (new CSVParser(in, delimiter)).getAllValues();
578: }
579:
580: /**
581: * Parse the delimited data from a stream.
582: * <p>
583: * Only escaped backslashes and quotes will be recognized as escape sequences.
584: * The data will be treated as having no comments.
585: *
586: * @param in Reader with comma delimited data to parse.
587: * @return parsed data.
588: * @throws IOException if an error occurs while reading.
589: *
590: * @since ostermillerutils 1.02.03
591: */
592: public static String[][] parse(Reader in) throws IOException {
593: return (new CSVParser(in)).getAllValues();
594: }
595:
596: /**
597: * Parse the delimited data from a stream.
598: * Escaped backslashes and quotes will always recognized as escape sequences.
599: *
600: * @param in Reader with delimited data to parse.
601: * @param delimiter record separator
602: * @param escapes a list of additional characters that will represent escape sequences.
603: * @param replacements the list of replacement characters for those escape sequences.
604: * @param commentDelims list of characters a comment line may start with.
605: * @return parsed data.
606: * @throws BadDelimiterException if the character cannot be used as a delimiter.
607: * @throws IOException if an error occurs while reading.
608: *
609: * @since ostermillerutils 1.02.24
610: */
611: public static String[][] parse(Reader in, char delimiter,
612: String escapes, String replacements, String commentDelims)
613: throws IOException, BadDelimiterException {
614: return (new CSVParser(in, delimiter, escapes, replacements,
615: commentDelims)).getAllValues();
616: }
617:
618: /**
619: * Parse the comma delimited data from a stream.
620: * Escaped backslashes and quotes will always recognized as escape sequences.
621: *
622: * @param in Reader with comma delimited data to parse.
623: * @param escapes a list of additional characters that will represent escape sequences.
624: * @param replacements the list of replacement characters for those escape sequences.
625: * @param commentDelims list of characters a comment line may start with.
626: * @return parsed data.
627: * @throws IOException if an error occurs while reading.
628: *
629: * @since ostermillerutils 1.02.03
630: */
631: public static String[][] parse(Reader in, String escapes,
632: String replacements, String commentDelims)
633: throws IOException {
634: return (new CSVParser(in, escapes, replacements, commentDelims))
635: .getAllValues();
636: }
637: }
|