001: package com.bm.utils.csv;
002:
003: import java.io.IOException;
004: import java.io.InputStream;
005: import java.io.Reader;
006:
007: /**
008: * CSV is a file format used as a portable representation of a database. Each
009: * line is one entry or record and the fields in a record are separated by
010: * commas. Commas may be preceded or followed by arbitrary space and/or tab
011: * characters which are ignored.
012: * <P>
013: * If field includes a comma or a new line, the whole field must be surrounded
014: * with double quotes. When the field is in quotes, any quote literals must be
015: * escaped by \" Backslash literals must be escaped by \\. Otherwise a backslash
016: * and the character following will be treated as the following character, IE.
017: * "\n" is equivalent to "n". Other escape sequences may be set using the
018: * setEscapes() method. Text that comes after quotes that have been closed but
019: * come before the next comma will be ignored.
020: * <P>
021: * Empty fields are returned as as String of length zero: "". The following line
022: * has three empty fields and three non-empty fields in it. There is an empty
023: * field on each end, and one in the middle. One token is returned as a space.<br>
024: *
025: * <pre>
026: * ,second,," ",fifth,
027: * </pre>
028: *
029: * <P>
030: * Blank lines are always ignored. Other lines will be ignored if they start
031: * with a comment character as set by the setCommentStart() method.
032: * <P>
033: * An example of how CVSLexer might be used:
034: *
035: * <pre>
036: * CSVParser shredder = new CSVParser(System.in);
037: * shredder.setCommentStart("#;!");
038: * shredder.setEscapes("nrtf", "\n\r\t\f");
039: * String t;
040: * while ((t = shredder.nextValue()) != null) {
041: * System.out.println("" + shredder.lastLineNumber() + " " + t);
042: * }
043: * </pre>
044: *
045: * <P>
046: * Some applications do not output CSV according to the generally accepted
047: * standards and this parse may not be able to handle it. One such application
048: * is the Microsoft Excel spreadsheet. A separate class must be use to read
049: *
050: * @author Daniel Wiese
051: * @since 17.04.2006
052: */
053: public class CSVParser implements CSVParse {
054:
055: /**
056: * InputStream on which this parser is based.
057: *
058: */
059: private InputStream inStream;
060:
061: /**
062: * Reader on which this parser is based.
063: *
064: */
065: private Reader inReader;
066:
067: /**
068: * Does all the dirty work. Calls for new tokens are routed through this
069: * object.
070: *
071: */
072: private CSVLexer lexer;
073:
074: /**
075: * Token cache. Used for when we request a token from the lexer but can't
076: * return it because its on the next line.
077: *
078: */
079: private String tokenCache;
080:
081: /**
082: * Line cache. The line number that goes along with the tokenCache. Not
083: * valid if the tokenCache is null.
084: *
085: */
086: private int lineCache;
087:
088: /**
089: * The line number the last token came from, or -1 if no tokens have been
090: * returned.
091: *
092: */
093: private int lastLine = -1;
094:
095: /**
096: * Create a parser to parse comma separated values from an InputStream.
097: * <p>
098: * Byte to character conversion is done using the platform default locale.
099: *
100: * @param in
101: * stream that contains comma separated values.
102: *
103: */
104: public CSVParser(InputStream in) {
105: inStream = in;
106: lexer = new CSVLexer(in);
107: }
108:
109: /**
110: * Create a parser to parse delimited values from an InputStream.
111: * <p>
112: * Byte to character conversion is done using the platform default locale.
113: *
114: * @param in
115: * stream that contains comma separated values.
116: * @param delimiter
117: * record separator
118: *
119: * @throws BadDelimiterException
120: * if the specified delimiter cannot be used
121: *
122: */
123: public CSVParser(InputStream in, char delimiter) {
124: inStream = in;
125: lexer = new CSVLexer(in);
126: changeDelimiter(delimiter);
127: }
128:
129: /**
130: * Create a parser to parse comma separated values from a Reader.
131: *
132: * @param in
133: * reader that contains comma separated values.
134: *
135: */
136: public CSVParser(Reader in) {
137: inReader = in;
138: lexer = new CSVLexer(in);
139: }
140:
141: /**
142: * Create a parser to parse delimited values from a Reader.
143: *
144: * @param in
145: * reader that contains comma separated values.
146: * @param delimiter
147: * record separator
148: *
149: * @throws BadDelimiterException
150: * if the specified delimiter cannot be used
151: *
152: */
153: public CSVParser(Reader in, char delimiter) {
154: inReader = in;
155: lexer = new CSVLexer(in);
156: changeDelimiter(delimiter);
157: }
158:
159: /**
160: * Create a parser to parse delimited values from an InputStream.
161: * <p>
162: * Byte to character conversion is done using the platform default locale.
163: *
164: * @param in
165: * stream that contains comma separated values.
166: * @param escapes
167: * a list of characters that will represent escape sequences.
168: * @param replacements
169: * the list of replacement characters for those escape sequences.
170: * @param commentDelims
171: * list of characters a comment line may start with.
172: * @param delimiter
173: * record separator
174: *
175: * @throws BadDelimiterException
176: * if the specified delimiter cannot be used
177: *
178: */
179: public CSVParser(InputStream in, char delimiter, String escapes,
180: String replacements, String commentDelims) {
181: inStream = in;
182: lexer = new CSVLexer(in);
183: setEscapes(escapes, replacements);
184: setCommentStart(commentDelims);
185: changeDelimiter(delimiter);
186: }
187:
188: /**
189: * Create a parser to parse comma separated values from an InputStream.
190: * <p>
191: * Byte to character conversion is done using the platform default locale.
192: *
193: * @param in
194: * stream that contains comma separated values.
195: * @param escapes
196: * a list of characters that will represent escape sequences.
197: * @param replacements
198: * the list of replacement characters for those escape sequences.
199: * @param commentDelims
200: * list of characters a comment line may start with.
201: *
202: */
203: public CSVParser(InputStream in, String escapes,
204: String replacements, String commentDelims) {
205: inStream = in;
206: lexer = new CSVLexer(in);
207: setEscapes(escapes, replacements);
208: setCommentStart(commentDelims);
209: }
210:
211: /**
212: * Create a parser to parse delimited values from a Reader.
213: *
214: * @param in
215: * reader that contains comma separated values.
216: * @param escapes
217: * a list of characters that will represent escape sequences.
218: * @param replacements
219: * the list of replacement characters for those escape sequences.
220: * @param commentDelims
221: * list of characters a comment line may start with.
222: * @param delimiter
223: * record separator
224: *
225: * @throws BadDelimiterException
226: * if the specified delimiter cannot be used
227: *
228: */
229: public CSVParser(Reader in, char delimiter, String escapes,
230: String replacements, String commentDelims) {
231: inReader = in;
232: lexer = new CSVLexer(in);
233: setEscapes(escapes, replacements);
234: setCommentStart(commentDelims);
235: changeDelimiter(delimiter);
236: }
237:
238: /**
239: * Create a parser to parse comma separated values from a Reader.
240: *
241: * @param in
242: * reader that contains comma separated values.
243: * @param escapes
244: * a list of characters that will represent escape sequences.
245: * @param replacements
246: * the list of replacement characters for those escape sequences.
247: * @param commentDelims
248: * list of characters a comment line may start with.
249: *
250: */
251: public CSVParser(Reader in, String escapes, String replacements,
252: String commentDelims) {
253: inReader = in;
254: lexer = new CSVLexer(in);
255: setEscapes(escapes, replacements);
256: setCommentStart(commentDelims);
257: }
258:
259: /**
260: * Close any stream upon which this parser is based.
261: *
262: * @throws IOException
263: * if an error occurs while closing the stream.
264: */
265: public void close() throws IOException {
266: if (inStream != null) {
267: inStream.close();
268: }
269: if (inReader != null) {
270: inReader.close();
271: }
272: }
273:
274: /**
275: * get the next value.
276: *
277: * @return the next value or null if there are no more values.
278: * @throws IOException
279: * if an error occurs while reading.
280: *
281: */
282: public String nextValue() throws IOException {
283: if (tokenCache == null) {
284: tokenCache = lexer.getNextToken();
285: lineCache = lexer.getLineNumber();
286: }
287: lastLine = lineCache;
288: String result = tokenCache;
289: tokenCache = null;
290: return result;
291: }
292:
293: /**
294: * Get the line number that the last token came from.
295: * <p>
296: * New line breaks that occur in the middle of a token are no counted in the
297: * line number count.
298: *
299: * @return line number or -1 if no tokens have been returned yet.
300: *
301: */
302: public int lastLineNumber() {
303: return lastLine;
304: }
305:
306: /**
307: * Specify escape sequences and their replacements. Escape sequences set
308: * here are in addition to \\ and \". \\ and \" are always valid escape
309: * sequences. This method allows standard escape sequenced to be used. For
310: * example "\n" can be set to be a newline rather than an 'n'. A common way
311: * to call this method might be:<br>
312: * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
313: * which would set the escape sequences to be the Java escape sequences.
314: * Characters that follow a \ that are not escape sequences will still be
315: * interpreted as that character.<br>
316: * The two arguments to this method must be the same length. If they are
317: * not, the longer of the two will be truncated.
318: *
319: * @param escapes
320: * a list of characters that will represent escape sequences.
321: * @param replacements
322: * the list of replacement characters for those escape sequences.
323: *
324: */
325: public void setEscapes(String escapes, String replacements) {
326: lexer.setEscapes(escapes, replacements);
327: }
328:
329: /**
330: * Change this parser so that it uses a new delimiter.
331: * <p>
332: * The initial character is a comma, the delimiter cannot be changed to a
333: * quote or other character that has special meaning in CSV.
334: *
335: * @param newDelim
336: * delimiter to which to switch.
337: * @throws BadDelimiterException
338: * if the character cannot be used as a delimiter.
339: *
340: */
341: public void changeDelimiter(char newDelim) {
342: lexer.changeDelimiter(newDelim);
343: }
344:
345: /**
346: * Change this parser so that it uses a new character for quoting.
347: * <p>
348: * The initial character is a double quote ("), the delimiter cannot be
349: * changed to a comma or other character that has special meaning in CSV.
350: *
351: * @param newQuote
352: * character to use for quoting.
353: * @throws BadQuoteException
354: * if the character cannot be used as a quote.
355: *
356: */
357: public void changeQuote(char newQuote) {
358: lexer.changeQuote(newQuote);
359: }
360:
361: /**
362: * Set the characters that indicate a comment at the beginning of the line.
363: * For example if the string "#;!" were passed in, all of the following
364: * lines would be comments:<br>
365: *
366: * <pre>
367: * # Comment
368: * ; Another Comment
369: * ! Yet another comment
370: * </pre>
371: *
372: * By default there are no comments in CVS files. Commas and quotes may not
373: * be used to indicate comment lines.
374: *
375: * @param commentDelims
376: * list of characters a comment line may start with.
377: *
378: */
379: public void setCommentStart(String commentDelims) {
380: lexer.setCommentStart(commentDelims);
381: }
382:
383: /**
384: * Get the number of the line from which the last value was retrieved.
385: *
386: * @return line number or -1 if no tokens have been returned.
387: *
388: */
389: public int getLastLineNumber() {
390: return lastLine;
391: }
392: }
|