001: /*
002: * Created on 23/10/2004
003: *
004: */
005: package com.quantum.csv.wizard;
006:
007: import java.io.BufferedReader;
008: import java.io.IOException;
009: import java.util.Vector;
010:
/**
 * Character-by-character CSV row reader.
 *
 * <p>Each call to {@link #parse(Vector, Vector)} consumes one row from the
 * underlying reader. Columns are split on the configured separator, a column
 * may be enclosed in double quotes (a doubled quote inside a quoted column
 * stands for one literal quote, and separators or end-of-line characters
 * inside quotes are kept as data), and an unquoted {@code #} starts a comment
 * that runs to the end of the row.
 *
 * @author panic
 */
public class CSVParser {

    /** Nothing of the next column has been read yet. */
    private static final byte CLEAN_STATE = 0;
    /** Inside a comment; everything up to the end of line is discarded. */
    private static final byte COMMENT_STATE = 1;
    /** Inside a quoted column. */
    private static final byte QUOTED_STATE = 2;
    /** Just read a quote inside a quoted column: either the closing quote or the first half of an escaped quote. */
    private static final byte IN_QUOTE_STATE = 3;
    /** Inside an unquoted column. */
    private static final byte CHARS_STATE = 4;

    private static final char QUOTE = '"';
    private static final char COMMENT_START = '#';

    private final BufferedReader stream;
    private final char[] eol;
    private final char columnSeparator;
    /** Content of the column currently being assembled. */
    private final StringBuffer actColumn = new StringBuffer();
    /** Raw characters consumed for the current row; only used to build error messages. */
    private final StringBuffer rawLine = new StringBuffer();

    /**
     * Creates a parser over the given reader.
     *
     * @param stream reader to consume; {@code mark}/{@code reset} are used on it
     *        when the end-of-line sequence is longer than one character
     * @param eol end-of-line sequence; must not be empty (checked by {@code parse})
     * @param columnSeparator character that separates columns
     * @throws IOException never thrown by this implementation; kept in the
     *         signature so existing callers keep compiling
     */
    CSVParser(BufferedReader stream, String eol, char columnSeparator)
            throws IOException {
        this.stream = stream;
        this.eol = eol.toCharArray();
        this.columnSeparator = columnSeparator;
    }

    /**
     * Parses the next row of the stream into columns.
     *
     * <p>Both vectors are cleared first. A blank row yields one empty column, a
     * row that consists only of a comment yields no columns, and a call made
     * when the stream is already exhausted yields one empty column and returns
     * 1. A quoted column that is still open when the stream ends is accepted
     * with the text read so far and flagged as unquoted.
     *
     * @param columns receives the parsed columns, as Strings
     * @param emptyStrings receives one Boolean per column: {@code TRUE} when the
     *        column was enclosed in double quotes (so an empty value is an
     *        explicit empty string), {@code FALSE} otherwise
     * @return 0 if the row ended at an end-of-line sequence, 1 if the end of
     *         the stream was reached during this call
     * @throws IOException if no end-of-line sequence is defined, if a closing
     *         quote is followed by anything other than a quote, a separator, a
     *         comment or the end of the row, or if reading fails
     */
    int parse(Vector columns, Vector emptyStrings) throws IOException {
        if (eol.length < 1) {
            throw new IOException("No End-Of-Line defined");
        }
        columns.clear(); // In case they are not empty
        emptyStrings.clear();
        // A previous call may have ended with an exception and left text behind.
        actColumn.setLength(0);
        rawLine.setLength(0);

        byte state = CLEAN_STATE;
        boolean endOfStream = false;
        boolean endOfLine = false;

        while (!endOfLine && !endOfStream) {
            int ic = stream.read();
            if (ic < 0) {
                // The -1 sentinel is not data and must never reach the state machine.
                endOfStream = true;
            } else if (state != QUOTED_STATE && ic == eol[0] && atEOL()) {
                // Inside quotes every character is data; anywhere else a full
                // end-of-line sequence terminates the row.
                endOfLine = true;
            } else {
                char c = (char) ic;
                rawLine.append(c);
                state = nextState(state, c, columns, emptyStrings);
            }
        }

        // Flush the column that was open when the row ended.
        if (state == IN_QUOTE_STATE) {
            // The last character read was the closing quote of a quoted column.
            addColumn(columns, emptyStrings, Boolean.TRUE);
        } else if (state != COMMENT_STATE) {
            addColumn(columns, emptyStrings, Boolean.FALSE);
        }

        return endOfStream ? 1 : 0;
    }

    /**
     * Feeds one character to the state machine.
     *
     * @return the state to use for the next character
     * @throws IOException if a closing quote is followed by an unexpected character
     */
    private byte nextState(byte state, char c, Vector columns, Vector emptyStrings)
            throws IOException {
        switch (state) {
        case CLEAN_STATE:
            if (c == QUOTE) {
                return QUOTED_STATE;
            }
            if (c == columnSeparator) {
                addColumn(columns, emptyStrings, Boolean.FALSE);
                return CLEAN_STATE;
            }
            if (c == COMMENT_START) {
                return COMMENT_STATE;
            }
            actColumn.append(c);
            return CHARS_STATE;

        case COMMENT_STATE:
            return COMMENT_STATE;

        case QUOTED_STATE:
            if (c == QUOTE) {
                // Closing quote or first half of an escaped quote; decided by the next character.
                return IN_QUOTE_STATE;
            }
            actColumn.append(c);
            return QUOTED_STATE;

        case IN_QUOTE_STATE:
            if (c == QUOTE) {
                // Doubled quote: one literal quote, still inside the quoted column.
                actColumn.append(QUOTE);
                return QUOTED_STATE;
            }
            if (c == columnSeparator) {
                // Flagged TRUE so that an empty quoted column can be told apart from no value.
                addColumn(columns, emptyStrings, Boolean.TRUE);
                return CLEAN_STATE;
            }
            if (c == COMMENT_START) {
                addColumn(columns, emptyStrings, Boolean.TRUE);
                return COMMENT_STATE;
            }
            throw new IOException("CSV Format Error :" + rawLine);

        case CHARS_STATE:
            if (c == COMMENT_START) {
                addColumn(columns, emptyStrings, Boolean.FALSE);
                return COMMENT_STATE;
            }
            if (c == columnSeparator) {
                addColumn(columns, emptyStrings, Boolean.FALSE);
                return CLEAN_STATE;
            }
            actColumn.append(c);
            return CHARS_STATE;

        default:
            return state;
        }
    }

    /**
     * Appends the column assembled so far to the result and resets the buffer.
     *
     * @param columns vector receiving the column text
     * @param nulls vector receiving the flag for this column
     * @param isNull flag stored alongside the column
     */
    private void addColumn(Vector columns, Vector nulls, Boolean isNull) {
        columns.add(actColumn.toString());
        nulls.add(isNull);
        actColumn.setLength(0);
    }

    /**
     * Checks whether the rest of the end-of-line sequence follows; the caller
     * has already read its first character. On a match the sequence is
     * consumed; otherwise the stream is reset to where it was.
     *
     * @return true if the stream is at the defined end-of-line sequence, false if not
     * @throws IOException if reading, marking or resetting the stream fails
     */
    private boolean atEOL() throws IOException {
        // Mark the point in the stream, to go back if it's not an EOL after all
        stream.mark(eol.length);
        for (int i = 1; i < eol.length; i++) {
            // End of stream (-1) never equals an EOL character, so it also resets.
            if (stream.read() != eol[i]) {
                stream.reset();
                return false;
            }
        }
        return true;
    }

}
|