001: package org.apache.turbine.util.parser;
002:
003: /*
004: * Licensed to the Apache Software Foundation (ASF) under one
005: * or more contributor license agreements. See the NOTICE file
006: * distributed with this work for additional information
007: * regarding copyright ownership. The ASF licenses this file
008: * to you under the Apache License, Version 2.0 (the
009: * "License"); you may not use this file except in compliance
010: * with the License. You may obtain a copy of the License at
011: *
012: * http://www.apache.org/licenses/LICENSE-2.0
013: *
014: * Unless required by applicable law or agreed to in writing,
015: * software distributed under the License is distributed on an
016: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017: * KIND, either express or implied. See the License for the
018: * specific language governing permissions and limitations
019: * under the License.
020: */
021:
022: import java.io.BufferedReader;
023: import java.io.IOException;
024: import java.io.InputStreamReader;
025: import java.io.Reader;
026: import java.io.StreamTokenizer;
027:
028: import java.util.ArrayList;
029: import java.util.Collections;
030: import java.util.Iterator;
031: import java.util.List;
032: import java.util.NoSuchElementException;
033:
034: import org.apache.commons.lang.exception.NestableRuntimeException;
035:
036: /**
037: * DataStreamParser is used to parse a stream with a fixed format and
038: * generate ValueParser objects which can be used to extract the values
039: * in the desired type.
040: *
041: * <p>The class itself is abstract - a concrete subclass which implements
042: * the initTokenizer method such as CSVParser or TSVParser is required
043: * to use the functionality.
044: *
045: * <p>The class implements the java.util.Iterator interface for convenience.
046: * This allows simple use in a Velocity template for example:
047: *
048: * <pre>
049: * #foreach ($row in $datastream)
050: * Name: $row.Name
051: * Description: $row.Description
052: * #end
053: * </pre>
054: *
055: * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
056: * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
057: * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
058: * @version $Id: DataStreamParser.java 534527 2007-05-02 16:10:59Z tv $
059: */
060: public abstract class DataStreamParser implements Iterator {
061: /**
062: * The constant for empty fields
063: */
064: protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";
065:
066: /**
067: * The list of column names.
068: */
069: private List columnNames = Collections.EMPTY_LIST;
070:
071: /**
072: * The stream tokenizer for reading values from the input reader.
073: */
074: private StreamTokenizer tokenizer;
075:
076: /**
077: * The parameter parser holding the values of columns for the current line.
078: */
079: private ValueParser lineValues;
080:
081: /**
082: * Indicates whether or not the tokenizer has read anything yet.
083: */
084: private boolean neverRead = true;
085:
086: /**
087: * The character encoding of the input
088: */
089: private String characterEncoding;
090:
091: /**
092: * The fieldseperator, which can be almost any char
093: */
094: private char fieldSeparator;
095:
096: /**
097: * Create a new DataStreamParser instance. Requires a Reader to read the
098: * comma-separated values from, a list of column names and a
099: * character encoding.
100: *
101: * @param in the input reader.
102: * @param columnNames a list of column names.
103: * @param characterEncoding the character encoding of the input.
104: */
105: public DataStreamParser(Reader in, List columnNames,
106: String characterEncoding) {
107: setColumnNames(columnNames);
108:
109: this .characterEncoding = characterEncoding;
110:
111: if (this .characterEncoding == null) {
112: if (in instanceof InputStreamReader) {
113: this .characterEncoding = ((InputStreamReader) in)
114: .getEncoding();
115: }
116:
117: if (this .characterEncoding == null) {
118: // try and get the characterEncoding from the reader
119: this .characterEncoding = "US-ASCII";
120: }
121: }
122:
123: tokenizer = new StreamTokenizer(new BufferedReader(in));
124: initTokenizer(tokenizer);
125: }
126:
127: /**
128: * Initialize the StreamTokenizer instance used to read the lines
129: * from the input reader. This must be implemented in subclasses to
130: * set up other tokenizing properties.
131: *
132: * @param tokenizer the tokenizer to adjust
133: */
134: protected void initTokenizer(StreamTokenizer tokenizer) {
135: tokenizer.resetSyntax();
136:
137: // leave out the comma sign (,), we need it for empty fields
138: tokenizer.wordChars(' ', Character.MAX_VALUE);
139:
140: // and set the quote mark as the quoting character
141: tokenizer.quoteChar('"');
142:
143: // and finally say that end of line is significant
144: tokenizer.eolIsSignificant(true);
145: }
146:
147: /**
148: * This method must be called to setup the field seperator
149: * @param fieldSeparator the char which separates the fields
150: */
151: public void setFieldSeparator(char fieldSeparator) {
152: this .fieldSeparator = fieldSeparator;
153: // make this field also an ordinary char by default.
154: tokenizer.ordinaryChar(fieldSeparator);
155: }
156:
157: /**
158: * Set the list of column names explicitly.
159: *
160: * @param columnNames A list of column names.
161: */
162: public void setColumnNames(List columnNames) {
163: if (columnNames != null) {
164: this .columnNames = columnNames;
165: }
166: }
167:
168: /**
169: * get the list of column names.
170: *
171: */
172: public List getColumnNames() {
173: return columnNames;
174: }
175:
176: /**
177: * Read the list of column names from the input reader using the
178: * tokenizer. If fieldNames are empty, we use the current fieldNumber
179: * + the EMPTYFIELDNAME to make one up.
180: *
181: * @exception IOException an IOException occurred.
182: */
183: public void readColumnNames() throws IOException {
184: List columnNames = new ArrayList();
185: int fieldCounter = 0;
186:
187: if (hasNextRow()) {
188: String colName = null;
189: boolean foundEol = false;
190:
191: while (!foundEol) {
192: tokenizer.nextToken();
193:
194: if (tokenizer.ttype == '"'
195: || tokenizer.ttype == StreamTokenizer.TT_WORD) {
196: // tokenizer.ttype is either '"' or TT_WORD
197: colName = tokenizer.sval;
198: } else {
199: // fieldSeparator, EOL or EOF
200: fieldCounter++;
201:
202: if (colName == null) {
203: colName = EMPTYFIELDNAME + fieldCounter;
204: }
205:
206: columnNames.add(colName);
207: colName = null;
208: }
209:
210: // EOL and EOF are checked independently from existing fields.
211: if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
212: foundEol = true;
213: } else if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
214: // Keep this token in the tokenizer for hasNext()
215: tokenizer.pushBack();
216: foundEol = true;
217: }
218: }
219:
220: setColumnNames(columnNames);
221: }
222: }
223:
224: /**
225: * Determine whether a further row of values exists in the input.
226: *
227: * @return true if the input has more rows.
228: * @exception IOException an IOException occurred.
229: */
230: public boolean hasNextRow() throws IOException {
231: // check for end of line ensures that an empty last line doesn't
232: // give a false positive for hasNextRow
233: if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL) {
234: tokenizer.nextToken();
235: tokenizer.pushBack();
236: neverRead = false;
237: }
238: return tokenizer.ttype != StreamTokenizer.TT_EOF;
239: }
240:
241: /**
242: * Returns a ValueParser object containing the next row of values.
243: *
244: * @return a ValueParser object.
245: * @exception IOException an IOException occurred.
246: * @exception NoSuchElementException there are no more rows in the input.
247: */
248: public ValueParser nextRow() throws IOException,
249: NoSuchElementException {
250: if (!hasNextRow()) {
251: throw new NoSuchElementException();
252: }
253:
254: if (lineValues == null) {
255: lineValues = new BaseValueParser(characterEncoding);
256: } else {
257: lineValues.clear();
258: }
259:
260: Iterator it = columnNames.iterator();
261:
262: String currVal = "";
263: String colName = null;
264:
265: boolean foundEol = false;
266: while (!foundEol || it.hasNext()) {
267: if (!foundEol) {
268: tokenizer.nextToken();
269: }
270:
271: if (colName == null && it.hasNext()) {
272: colName = String.valueOf(it.next());
273: }
274:
275: if (tokenizer.ttype == '"'
276: || tokenizer.ttype == StreamTokenizer.TT_WORD) {
277: // tokenizer.ttype is either '"' or TT_WORD
278: currVal = tokenizer.sval;
279: } else {
280: // fieldSeparator, EOL or EOF
281: lineValues.add(colName, currVal);
282: colName = null;
283: currVal = "";
284: }
285:
286: // EOL and EOF are checked independently from existing fields.
287: if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
288: foundEol = true;
289: } else if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
290: // Keep this token in the tokenizer for hasNext()
291: tokenizer.pushBack();
292: foundEol = true;
293: }
294: }
295:
296: return lineValues;
297: }
298:
299: /**
300: * Determine whether a further row of values exists in the input.
301: *
302: * @return true if the input has more rows.
303: */
304: public boolean hasNext() {
305: boolean hasNext = false;
306:
307: try {
308: hasNext = hasNextRow();
309: } catch (IOException e) {
310: throw new NestableRuntimeException(e);
311: }
312:
313: return hasNext;
314: }
315:
316: /**
317: * Returns a ValueParser object containing the next row of values.
318: *
319: * @return a ValueParser object as an Object.
320: * @exception NoSuchElementException there are no more rows in the input
321: * or an IOException occurred.
322: */
323: public Object next() throws NoSuchElementException {
324: Object nextRow = null;
325:
326: try {
327: nextRow = nextRow();
328: } catch (IOException e) {
329: throw new NestableRuntimeException(e);
330: }
331:
332: return nextRow;
333: }
334:
335: /**
336: * The optional Iterator.remove method is not supported.
337: *
338: * @exception UnsupportedOperationException the operation is not supported.
339: */
340: public void remove() throws UnsupportedOperationException {
341: throw new UnsupportedOperationException();
342: }
343: }
|