001: package jimm.datavision.source.charsep;
002:
003: import java.io.Reader;
004: import java.io.IOException;
005: import java.util.List;
006: import java.util.ArrayList;
007:
/**
 * Parses delimited data one record at a time. Handles quoted columns,
 * doubled quotes inside quoted columns, and delimiters or line breaks
 * embedded in quoted columns.
 * <p>
 * A record ends at <code>'\n'</code>, <code>'\r'</code>, or
 * <code>"\r\n"</code> that is not inside a quoted column, or at the end of
 * the input.
 *
 * @author Jim Menard, <a href="mailto:jimm@io.com">jimm@io.com</a>
 */
public class DelimParser {

    /**
     * Value returned by {@link #nextChar} at end of input. Also used as the
     * "nothing pushed back" marker in {@link #pushbackChar}.
     */
    public static final int EOF = -1;

    /** Character that separates columns. */
    protected char delimiter;

    /** Source of the characters to parse; never closed by this class. */
    protected Reader in;

    /** Single pushed-back character, or {@link #EOF} if there is none. */
    protected int pushbackChar;

    /**
     * Constructor, using ',' as the delimiter. The caller must close
     * <var>in</var>.
     *
     * @param in input reader
     */
    public DelimParser(Reader in) {
        this(in, ',');
    }

    /**
     * Constructor. The caller must close <var>in</var>.
     *
     * @param in input reader
     * @param delimiter delimiter character
     */
    public DelimParser(Reader in, char delimiter) {
        this.delimiter = delimiter;
        this.in = in;
        pushbackChar = EOF;
    }

    /**
     * Reads the next record and returns its columns, or <code>null</code> if
     * there is no more data. Handles delimiters and quotes within the data
     * just as they are generated by Excel comma- and tab-separated files.
     * <p>
     * A blank line in the middle of the input produces a list holding one
     * empty string; a blank line immediately before the end of the input is
     * ignored.
     *
     * @return a <code>List</code> of strings; return <code>null</code> if
     * there is no more data.
     * @throws IOException if reading from the underlying reader fails
     */
    public List parse() throws IOException {
        ArrayList columns = null;
        StringBuffer buf = new StringBuffer();
        boolean insideQuotes = false;
        // True once the current column has been opened with a quote, so that
        // an empty quoted column ("") is not mistaken for "no data".
        boolean columnWasQuoted = false;
        // 1 when the previous character was a quote inside a quoted column
        // that may be either a closing quote or the first of a doubled quote.
        int numQuotesSeen = 0;

        int charAsInt;
        while ((charAsInt = nextChar()) != EOF) {
            char c = (char) charAsInt;

            switch (c) {
            case '"': // Quote character
                if (!insideQuotes) { // Start of quoted column
                    insideQuotes = true;
                    columnWasQuoted = true;
                    numQuotesSeen = 0;
                } else if (numQuotesSeen == 1) { // Second of doubled quotes
                    buf.append(c);
                    numQuotesSeen = 0;
                } else { // Closing quote or first of doubled quotes
                    numQuotesSeen = 1;
                }
                break;
            case '\n': // Linefeed/newline
            case '\r':
                if (insideQuotes && numQuotesSeen != 1) {
                    // Line break embedded in quoted column data
                    buf.append(c);
                    break;
                }

                // End of record, either unquoted or right after a closing
                // quote. Both cases must eat the '\n' of a DOS line ending,
                // otherwise the next call would see it as a blank line.
                if (c == '\r') {
                    skipLineFeed();
                }

                // Only look ahead when the record is otherwise empty.
                if (columns == null && buf.length() == 0 && !columnWasQuoted
                        && atEof()) {
                    return null; // Empty line at end of file
                }
                return addColumn(columns, buf);
            default:
                if (c != delimiter) { // Everything else
                    numQuotesSeen = 0;
                    buf.append(c);
                } else if (insideQuotes && numQuotesSeen != 1) {
                    // Delimiter inside quoted column
                    buf.append(c);
                } else {
                    // Normal delimiter, or delimiter right after the closing
                    // quote of a quoted column
                    insideQuotes = false;
                    columnWasQuoted = false;
                    columns = addColumn(columns, buf);
                    buf = new StringBuffer();
                }
                break;
            }
        }

        // We've reached EOF
        if (columns == null && buf.length() == 0 && !columnWasQuoted) {
            return null; // Nothing at all was read for this record
        }

        // Either some text was read, a delimiter was seen (so a trailing,
        // possibly empty, column is pending), or a quoted column was opened.
        return addColumn(columns, buf);
    }

    /**
     * Appends the text in <var>buf</var> to <var>columns</var> as a new
     * column, creating the list if it does not exist yet.
     *
     * @param columns columns collected so far; may be <code>null</code>
     * @param buf text of the column to add
     * @return the list holding the new column
     */
    private static ArrayList addColumn(ArrayList columns, StringBuffer buf) {
        if (columns == null) {
            columns = new ArrayList();
        }
        columns.add(buf.toString());
        return columns;
    }

    /**
     * Consumes the next character if it is <code>'\n'</code>; otherwise
     * pushes it back. Called after a <code>'\r'</code> to handle DOS line
     * endings.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    private void skipLineFeed() throws IOException {
        int next = nextChar();
        if (next != '\n') {
            pushback(next);
        }
    }

    /**
     * Tells whether the input is exhausted, without consuming a character.
     *
     * @return <code>true</code> if the next character is {@link #EOF}
     * @throws IOException if reading from the underlying reader fails
     */
    private boolean atEof() throws IOException {
        int next = nextChar();
        pushback(next);
        return next == EOF;
    }

    /**
     * Returns the pushed-back character if there is one, else the next
     * character read from the reader.
     *
     * @return the next character as an int, or {@link #EOF} at end of input
     * @throws IOException if reading from the underlying reader fails
     */
    protected int nextChar() throws IOException {
        if (pushbackChar == EOF) {
            return in.read();
        }
        int c = pushbackChar;
        pushbackChar = EOF;
        return c;
    }

    /**
     * Stores a single character to be returned by the next call to
     * {@link #nextChar}. Only one character is remembered. Pushing back
     * {@link #EOF} is the same as pushing back nothing: the next call to
     * {@link #nextChar} reads from the reader again.
     *
     * @param charAsInt the character to push back
     */
    protected void pushback(int charAsInt) {
        pushbackChar = charAsInt;
    }

}
|