001: /*
002: * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
004: */
005: package com.sun.portal.ubt.report.data.file.parser;
006:
007: import java.io.*;
008: import java.util.Vector;
009:
010: /**
011: * Read files in comma separated value format.
012: * CSV is a file format used as a portable representation of a database.
013: * Each line is one entry or record and the fields in a record are separated by commas.
014: * Commas may be preceded or followed by arbitrary space and/or tab characters which are
015: * ignored.
016: * <P>
017: * If field includes a comma or a new line, the whole field must be surrounded with double quotes.
018: * When the field is in quotes, any quote literals must be escaped by \" Backslash
019: * literals must be escaped by \\. Otherwise a backslash and the character following
020: * will be treated as the following character, IE. "\n" is equivalent to "n". Other escape
021: * sequences may be set using the setEscapes() method. Text that comes after quotes that have
022: * been closed but come before the next comma will be ignored.
023: * <P>
024: * Empty fields are returned as as String of length zero: "". The following line has three empty
025: * fields and three non-empty fields in it. There is an empty field on each end, and one in the
026: * middle. One token is returned as a space.<br>
027: * <pre>,second,," ",fifth,</pre>
028: * <P>
029: * Blank lines are always ignored. Other lines will be ignored if they start with a
030: * comment character as set by the setCommentStart() method.
031: * <P>
032: * An example of how CVSLexer might be used:
033: * <pre>
034: * CSVParser shredder = new CSVParser(System.in);
035: * shredder.setCommentStart("#;!");
036: * shredder.setEscapes("nrtf", "\n\r\t\f");
037: * String t;
038: * while ((t = shredder.nextValue()) != null){
039: * System.out.println("" + shredder.lastLineNumber() + " " + t);
040: * }
041: * </pre>
042: * <P>
043: */
044: public class CSVParser implements CSVParse {
045:
046: /**
047: * InputStream on which this parser is based.
048: *
049: */
050: private InputStream inStream;
051:
052: /**
053: * Reader on which this parser is based.
054: *
055: */
056: private Reader inReader;
057:
058: /**
059: * Does all the dirty work.
060: * Calls for new tokens are routed through
061: * this object.
062: *
063: */
064: private CSVLexer lexer;
065:
066: /**
067: * Token cache. Used for when we request a token
068: * from the lexer but can't return it because its
069: * on the next line.
070: *
071: */
072: private String tokenCache;
073:
074: /**
075: * Line cache. The line number that goes along with
076: * the tokenCache. Not valid if the tokenCache is
077: * null.
078: *
079: */
080: private int lineCache;
081:
082: /**
083: * The line number the last token came from, or -1 if
084: * no tokens have been returned.
085: *
086: */
087: private int lastLine = -1;
088:
089: /**
090: * Create a parser to parse comma separated values from
091: * an InputStream.
092: * <p>
093: * Byte to character conversion is done using the platform
094: * default locale.
095: *
096: * @param in stream that contains comma separated values.
097: *
098: */
099: public CSVParser(InputStream in) {
100: inStream = in;
101: lexer = new CSVLexer(in);
102: }
103:
104: /**
105: * Create a parser to parse delimited values from
106: * an InputStream.
107: * <p>
108: * Byte to character conversion is done using the platform
109: * default locale.
110: *
111: * @param in stream that contains comma separated values.
112: * @param delimiter record separator
113: *
114: * @throws BadDelimiterException if the specified delimiter cannot be used
115: *
116: */
117: public CSVParser(InputStream in, char delimiter)
118: throws BadDelimiterException {
119: inStream = in;
120: lexer = new CSVLexer(in);
121: changeDelimiter(delimiter);
122: }
123:
124: /**
125: * Create a parser to parse comma separated values from
126: * a Reader.
127: *
128: * @param in reader that contains comma separated values.
129: *
130: */
131: public CSVParser(Reader in) {
132: inReader = in;
133: lexer = new CSVLexer(in);
134: }
135:
136: /**
137: * Create a parser to parse delimited values from
138: * a Reader.
139: *
140: * @param in reader that contains comma separated values.
141: * @param delimiter record separator
142: *
143: * @throws BadDelimiterException if the specified delimiter cannot be used
144: *
145: */
146: public CSVParser(Reader in, char delimiter)
147: throws BadDelimiterException {
148: inReader = in;
149: lexer = new CSVLexer(in);
150: changeDelimiter(delimiter);
151: }
152:
153: /**
154: * Create a parser to parse delimited values from
155: * an InputStream.
156: * <p>
157: * Byte to character conversion is done using the platform
158: * default locale.
159: *
160: * @param in stream that contains comma separated values.
161: * @param escapes a list of characters that will represent escape sequences.
162: * @param replacements the list of replacement characters for those escape sequences.
163: * @param commentDelims list of characters a comment line may start with.
164: * @param delimiter record separator
165: *
166: * @throws BadDelimiterException if the specified delimiter cannot be used
167: *
168: */
169: public CSVParser(InputStream in, char delimiter, String escapes,
170: String replacements, String commentDelims)
171: throws BadDelimiterException {
172: inStream = in;
173: lexer = new CSVLexer(in);
174: setEscapes(escapes, replacements);
175: setCommentStart(commentDelims);
176: changeDelimiter(delimiter);
177: }
178:
179: /**
180: * Create a parser to parse comma separated values from
181: * an InputStream.
182: * <p>
183: * Byte to character conversion is done using the platform
184: * default locale.
185: *
186: * @param in stream that contains comma separated values.
187: * @param escapes a list of characters that will represent escape sequences.
188: * @param replacements the list of replacement characters for those escape sequences.
189: * @param commentDelims list of characters a comment line may start with.
190: *
191: */
192: public CSVParser(InputStream in, String escapes,
193: String replacements, String commentDelims) {
194: inStream = in;
195: lexer = new CSVLexer(in);
196: setEscapes(escapes, replacements);
197: setCommentStart(commentDelims);
198: }
199:
200: /**
201: * Create a parser to parse delimited values from
202: * a Reader.
203: *
204: * @param in reader that contains comma separated values.
205: * @param escapes a list of characters that will represent escape sequences.
206: * @param replacements the list of replacement characters for those escape sequences.
207: * @param commentDelims list of characters a comment line may start with.
208: * @param delimiter record separator
209: *
210: * @throws com.sun.portal.ubt.report.data.file.parser.BadDelimiterException if the specified delimiter cannot be used
211: *
212: */
213: public CSVParser(Reader in, char delimiter, String escapes,
214: String replacements, String commentDelims)
215: throws BadDelimiterException {
216: inReader = in;
217: lexer = new CSVLexer(in);
218: setEscapes(escapes, replacements);
219: setCommentStart(commentDelims);
220: changeDelimiter(delimiter);
221: }
222:
223: /**
224: * Create a parser to parse comma separated values from
225: * a Reader.
226: *
227: * @param in reader that contains comma separated values.
228: * @param escapes a list of characters that will represent escape sequences.
229: * @param replacements the list of replacement characters for those escape sequences.
230: * @param commentDelims list of characters a comment line may start with.
231: *
232: */
233: public CSVParser(Reader in, String escapes, String replacements,
234: String commentDelims) {
235: inReader = in;
236: lexer = new CSVLexer(in);
237: setEscapes(escapes, replacements);
238: setCommentStart(commentDelims);
239: }
240:
241: /**
242: * Close any stream upon which this parser is based.
243: *
244: * @throws IOException if an error occurs while closing the stream.
245: */
246: public void close() throws IOException {
247: if (inStream != null)
248: inStream.close();
249: if (inReader != null)
250: inReader.close();
251: }
252:
253: /**
254: * get the next value.
255: *
256: * @return the next value or null if there are no more values.
257: * @throws IOException if an error occurs while reading.
258: *
259: */
260: public String nextValue() throws IOException {
261: if (tokenCache == null) {
262: tokenCache = lexer.getNextToken();
263: lineCache = lexer.getLineNumber();
264: }
265: lastLine = lineCache;
266: String result = tokenCache;
267: tokenCache = null;
268: return result;
269: }
270:
271: /**
272: * Get the line number that the last token came from.
273: * <p>
274: * New line breaks that occur in the middle of a token are no
275: * counted in the line number count.
276: *
277: * @return line number or -1 if no tokens have been returned yet.
278: *
279: */
280: public int lastLineNumber() {
281: return lastLine;
282: }
283:
284: /**
285: * Get all the values from a line.
286: * <p>
287: * If the line has already been partially read, only the
288: * values that have not already been read will be included.
289: *
290: * @return all the values from the line or null if there are no more values.
291: * @throws IOException if an error occurs while reading.
292: *
293: */
294: public String[] getLine() throws IOException {
295: int lineNumber = -1;
296: Vector v = new Vector();
297: if (tokenCache != null) {
298: v.add(tokenCache);
299: lineNumber = lineCache;
300: }
301: while ((tokenCache = lexer.getNextToken()) != null
302: && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)) {
303: v.add(tokenCache);
304: lineNumber = lexer.getLineNumber();
305: }
306: if (v.size() == 0) {
307: return null;
308: }
309: lastLine = lineNumber;
310: lineCache = lexer.getLineNumber();
311: String[] result = new String[v.size()];
312: return ((String[]) v.toArray(result));
313: }
314:
315: /**
316: * Get all the values from the file.
317: * <p>
318: * If the file has already been partially read, only the
319: * values that have not already been read will be included.
320: * <p>
321: * Each line of the file that has at least one value will be
322: * represented. Comments and empty lines are ignored.
323: * <p>
324: * The resulting double array may be jagged.
325: *
326: * @return all the values from the file or null if there are no more values.
327: * @throws IOException if an error occurs while reading.
328: *
329: */
330: public String[][] getAllValues() throws IOException {
331: Vector v = new Vector();
332: String[] line;
333: while ((line = getLine()) != null) {
334: v.add(line);
335: }
336: if (v.size() == 0) {
337: return null;
338: }
339: String[][] result = new String[v.size()][];
340: return ((String[][]) v.toArray(result));
341: }
342:
343: /**
344: * Specify escape sequences and their replacements.
345: * Escape sequences set here are in addition to \\ and \".
346: * \\ and \" are always valid escape sequences. This method
347: * allows standard escape sequenced to be used. For example
348: * "\n" can be set to be a newline rather than an 'n'.
349: * A common way to call this method might be:<br>
350: * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
351: * which would set the escape sequences to be the Java escape
352: * sequences. Characters that follow a \ that are not escape
353: * sequences will still be interpreted as that character.<br>
354: * The two arguments to this method must be the same length. If
355: * they are not, the longer of the two will be truncated.
356: *
357: * @param escapes a list of characters that will represent escape sequences.
358: * @param replacements the list of replacement characters for those escape sequences.
359: *
360: */
361: public void setEscapes(String escapes, String replacements) {
362: lexer.setEscapes(escapes, replacements);
363: }
364:
365: /**
366: * Change this parser so that it uses a new delimiter.
367: * <p>
368: * The initial character is a comma, the delimiter cannot be changed
369: * to a quote or other character that has special meaning in CSV.
370: *
371: * @param newDelim delimiter to which to switch.
372: * @throws BadDelimiterException if the character cannot be used as a delimiter.
373: *
374: */
375: public void changeDelimiter(char newDelim)
376: throws BadDelimiterException {
377: lexer.changeDelimiter(newDelim);
378: }
379:
380: /**
381: * Change this parser so that it uses a new character for quoting.
382: * <p>
383: * The initial character is a double quote ("), the delimiter cannot be changed
384: * to a comma or other character that has special meaning in CSV.
385: *
386: * @param newQuote character to use for quoting.
387: * @throws BadQuoteException if the character cannot be used as a quote.
388: *
389: */
390: public void changeQuote(char newQuote) throws BadQuoteException {
391: lexer.changeQuote(newQuote);
392: }
393:
394: /**
395: * Set the characters that indicate a comment at the beginning of the line.
396: * For example if the string "#;!" were passed in, all of the following lines
397: * would be comments:<br>
398: * <pre> # Comment
399: * ; Another Comment
400: * ! Yet another comment</pre>
401: * By default there are no comments in CVS files. Commas and quotes may not be
402: * used to indicate comment lines.
403: *
404: * @param commentDelims list of characters a comment line may start with.
405: *
406: */
407: public void setCommentStart(String commentDelims) {
408: lexer.setCommentStart(commentDelims);
409: }
410:
411: /**
412: * Get the number of the line from which the last value was retrieved.
413: *
414: * @return line number or -1 if no tokens have been returned.
415: *
416: */
417: public int getLastLineNumber() {
418: return lastLine;
419: }
420:
421: /**
422: * Parse the given file for comma separated values and print the results
423: * to System.out.
424: *
425: * @param args First argument is the file name. System.in used if no filename given.
426: *
427: */
428: private static void main(String[] args) {
429: InputStream in;
430: try {
431: if (args.length > 0) {
432: File f = new File(args[0]);
433: if (f.exists()) {
434: if (f.canRead()) {
435: in = new FileInputStream(f);
436: } else {
437: throw new IOException("Could not open "
438: + args[0]);
439: }
440: } else {
441: throw new IOException("Could not find " + args[0]);
442: }
443: } else {
444: in = System.in;
445: }
446: CSVParser p = new CSVParser(in);
447: p.setCommentStart("#;!");
448: p.setEscapes("nrtf", "\n\r\t\f");
449: String[] t;
450: while ((t = p.getLine()) != null) {
451: for (int i = 0; i < t.length; i++) {
452: System.out.print('"' + t[i] + '"');
453: if (i < t.length - 1) {
454: System.out.print(", ");
455: }
456: }
457: System.out.println();
458: }
459: } catch (IOException e) {
460: System.out.println(e.getMessage());
461: }
462: }
463:
464: /**
465: * Parse the comma delimited data from a string.
466: * <p>
467: * Only escaped backslashes and quotes will be recognized as escape sequences.
468: * The data will be treated as having no comments.
469: *
470: * @param s string with comma delimited data to parse.
471: * @return parsed data.
472: *
473: */
474: public static String[][] parse(String s) {
475: try {
476: return (new CSVParser(new StringReader(s))).getAllValues();
477: } catch (IOException x) {
478: return null;
479: }
480: }
481:
482: /**
483: * Parse the delimited data from a string.
484: * <p>
485: * Only escaped backslashes and quotes will be recognized as escape sequences.
486: * The data will be treated as having no comments.
487: *
488: * @param s string with delimited data to parse.
489: * @param delimiter record separator
490: * @return parsed data.
491: * @throws BadDelimiterException if the character cannot be used as a delimiter.
492: *
493: */
494: public static String[][] parse(String s, char delimiter)
495: throws BadDelimiterException {
496: try {
497: return (new CSVParser(new StringReader(s), delimiter))
498: .getAllValues();
499: } catch (IOException x) {
500: return null;
501: }
502: }
503:
504: /**
505: * Parse the comma delimited data from a string.
506: * Escaped backslashes and quotes will always recognized as escape sequences.
507: *
508: * @param s string with comma delimited data to parse.
509: * @param escapes a list of additional characters that will represent escape sequences.
510: * @param replacements the list of replacement characters for those escape sequences.
511: * @param commentDelims list of characters a comment line may start with.
512: * @return parsed data.
513: *
514: */
515: public static String[][] parse(String s, String escapes,
516: String replacements, String commentDelims) {
517: try {
518: return (new CSVParser(new StringReader(s), escapes,
519: replacements, commentDelims)).getAllValues();
520: } catch (IOException x) {
521: return null;
522: }
523: }
524:
525: /**
526: * Parse the delimited data from a string.
527: * Escaped backslashes and quotes will always recognized as escape sequences.
528: *
529: * @param s string with delimited data to parse.
530: * @param escapes a list of additional characters that will represent escape sequences.
531: * @param replacements the list of replacement characters for those escape sequences.
532: * @param commentDelims list of characters a comment line may start with.
533: * @param delimiter record separator
534: * @return parsed data.
535: * @throws BadDelimiterException if the character cannot be used as a delimiter.
536: *
537: */
538: public static String[][] parse(String s, char delimiter,
539: String escapes, String replacements, String commentDelims)
540: throws BadDelimiterException {
541: try {
542: return (new CSVParser(new StringReader(s), delimiter,
543: escapes, replacements, commentDelims))
544: .getAllValues();
545: } catch (IOException x) {
546: return null;
547: }
548: }
549:
550: /**
551: * Parse the comma delimited data from a stream.
552: * <p>
553: * Only escaped backslashes and quotes will be recognized as escape sequences.
554: * The data will be treated as having no comments.
555: *
556: * @param in Reader with comma delimited data to parse.
557: * @param delimiter record separator
558: * @return parsed data.
559: * @throws com.sun.portal.ubt.report.data.file.parser.BadDelimiterException if the character cannot be used as a delimiter.
560: * @throws IOException if an error occurs while reading.
561: *
562: */
563: public static String[][] parse(Reader in, char delimiter)
564: throws IOException, BadDelimiterException {
565: return (new CSVParser(in, delimiter)).getAllValues();
566: }
567:
568: /**
569: * Parse the delimited data from a stream.
570: * <p>
571: * Only escaped backslashes and quotes will be recognized as escape sequences.
572: * The data will be treated as having no comments.
573: *
574: * @param in Reader with comma delimited data to parse.
575: * @return parsed data.
576: * @throws IOException if an error occurs while reading.
577: *
578: */
579: public static String[][] parse(Reader in) throws IOException {
580: return (new CSVParser(in)).getAllValues();
581: }
582:
583: /**
584: * Parse the delimited data from a stream.
585: * Escaped backslashes and quotes will always recognized as escape sequences.
586: *
587: * @param in Reader with delimited data to parse.
588: * @param delimiter record separator
589: * @param escapes a list of additional characters that will represent escape sequences.
590: * @param replacements the list of replacement characters for those escape sequences.
591: * @param commentDelims list of characters a comment line may start with.
592: * @return parsed data.
593: * @throws BadDelimiterException if the character cannot be used as a delimiter.
594: * @throws IOException if an error occurs while reading.
595: *
596: */
597: public static String[][] parse(Reader in, char delimiter,
598: String escapes, String replacements, String commentDelims)
599: throws IOException, BadDelimiterException {
600: return (new CSVParser(in, delimiter, escapes, replacements,
601: commentDelims)).getAllValues();
602: }
603:
604: /**
605: * Parse the comma delimited data from a stream.
606: * Escaped backslashes and quotes will always recognized as escape sequences.
607: *
608: * @param in Reader with comma delimited data to parse.
609: * @param escapes a list of additional characters that will represent escape sequences.
610: * @param replacements the list of replacement characters for those escape sequences.
611: * @param commentDelims list of characters a comment line may start with.
612: * @return parsed data.
613: * @throws IOException if an error occurs while reading.
614: *
615: */
616: public static String[][] parse(Reader in, String escapes,
617: String replacements, String commentDelims)
618: throws IOException {
619: return (new CSVParser(in, escapes, replacements, commentDelims))
620: .getAllValues();
621: }
622: }
|