001: /**
002: * ========================================
003: * JFreeReport : a free Java report library
004: * ========================================
005: *
006: * Project Info: http://reporting.pentaho.org/
007: *
008: * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors.
009: *
010: * This library is free software; you can redistribute it and/or modify it under the terms
011: * of the GNU Lesser General Public License as published by the Free Software Foundation;
012: * either version 2.1 of the License, or (at your option) any later version.
013: *
014: * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
015: * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
016: * See the GNU Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License along with this
019: * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
020: * Boston, MA 02111-1307, USA.
021: *
022: * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
023: * in the United States and other countries.]
024: *
025: * ------------
026: * $Id: CSVTokenizer.java 3525 2007-10-16 11:43:48Z tmorgner $
027: * ------------
028: * (C) Copyright 2000-2005, by Object Refinery Limited.
029: * (C) Copyright 2005-2007, by Pentaho Corporation.
030: */package org.jfree.report.util;
031:
032: import java.util.Enumeration;
033: import java.util.NoSuchElementException;
034:
035: /**
036: * The csv tokenizer class allows an application to break a Comma Separated Value format
037: * into tokens. The tokenization method is much simpler than the one used by the
038: * <code>StringTokenizer</code> class. The <code>CSVTokenizer</code> methods do not
039: * distinguish among identifiers, numbers, and quoted strings, nor do they recognize and
040: * skip comments.
041: * <p/>
042: * The set of separator (the characters that separate tokens) may be specified either at
043: * creation time or on a per-token basis.
044: * <p/>
045: * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on
046: * whether it was created with the <code>returnSeparators</code> flag having the value
047: * <code>true</code> or <code>false</code>: <ul> <li>If the flag is <code>false</code>,
048: * delimiter characters serve to separate tokens. A token is a maximal sequence of
049: * consecutive characters that are not separator. <li>If the flag is <code>true</code>,
050: * delimiter characters are themselves considered to be tokens. A token is thus either one
051: * delimiter character, or a maximal sequence of consecutive characters that are not
052: * separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current
053: * position within the string to be tokenized. Some operations advance this current
054: * position past the characters processed.<p> A token is returned by taking a substring of
055: * the string that was used to create the <tt>CSVTokenizer</tt> object.
056: * <p/>
057: * The following is one example of the use of the tokenizer. The code:
058: * <blockquote><pre>
059: * CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
060: * while (csvt.hasMoreTokens()) {
061: * println(csvt.nextToken());
062: * }
063: * </pre></blockquote>
064: * <p/>
065: * prints the following output:
066: * <blockquote><pre>
067: * this
068: * is
069: * a
070: * test
071: * </pre></blockquote>
072: *
073: * @author abupon
074: */
075: public class CSVTokenizer implements Enumeration {
076: /**
077: * The complete record that should be separated into elements.
078: */
079: private String record;
080: /**
081: * The separator.
082: */
083: private String separator;
084: /**
085: * The quoting char.
086: */
087: private String quate;
088:
089: /**
090: * the current parsing position.
091: */
092: private int currentIndex;
093:
094: private boolean beforeStart;
095:
096: /**
097: * A possible separator constant.
098: */
099: public static final String SEPARATOR_COMMA = ",";
100: /**
101: * A possible separator constant.
102: */
103: public static final String SEPARATOR_TAB = "\t";
104: /**
105: * A possible separator constant.
106: */
107: public static final String SEPARATOR_SPACE = " ";
108:
109: /**
110: * A possible quote character constant.
111: */
112: public static final String DOUBLE_QUATE = "\"";
113: /**
114: * A possible quote character constant.
115: */
116: public static final String SINGLE_QUATE = "'";
117:
118: /**
119: * Constructs a csv tokenizer for the specified string. <code>theSeparator</code>
120: * argument is the separator for separating tokens.
121: * <p/>
122: * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator
123: * string is also returned as tokens. separator is returned as a string. If the flag is
124: * <code>false</code>, the separator string is skipped and only serve as separator
125: * between tokens.
126: *
127: * @param aString a string to be parsed.
128: * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
129: * CSVTokenizer.SPACE, etc.).
130: * @param theQuate the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE,
131: * etc.).
132: */
133: public CSVTokenizer(final String aString,
134: final String theSeparator, final String theQuate) {
135: if (aString == null) {
136: throw new NullPointerException("The given string is null");
137: }
138: if (theSeparator == null) {
139: throw new NullPointerException(
140: "The given separator is null");
141: }
142: if (theQuate == null) {
143: throw new NullPointerException("The given quate is null");
144: }
145: this .record = aString.trim();
146: this .separator = theSeparator;
147: this .quate = theQuate;
148: this .currentIndex = 0;
149: this .beforeStart = true;
150: }
151:
152: /**
153: * Constructs a csv tokenizer for the specified string. The characters in the
154: * <code>theSeparator</code> argument are the separator for separating tokens. Separator
155: * string themselves will not be treated as tokens.
156: *
157: * @param aString a string to be parsed.
158: * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB,
159: * CSVTokenizer.SPACE, etc.).
160: */
161: public CSVTokenizer(final String aString, final String theSeparator) {
162: this (aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
163: }
164:
165: /**
166: * Constructs a string tokenizer for the specified string. The tokenizer uses the
167: * default separator set, which is <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator
168: * string themselves will not be treated as tokens.
169: *
170: * @param aString a string to be parsed.
171: */
172: public CSVTokenizer(final String aString) {
173: this (aString, CSVTokenizer.SEPARATOR_COMMA);
174: }
175:
176: /**
177: * Tests if there are more tokens available from this tokenizer's string. If this method
178: * returns <tt>true</tt>, then a subsequent call to <tt>nextToken</tt> with no argument
179: * will successfully return a token.
180: *
181: * @return <code>true</code> if and only if there is at least one token in the string
182: * after the current position; <code>false</code> otherwise.
183: */
184: public boolean hasMoreTokens() {
185: return (this .currentIndex < this .record.length());
186: }
187:
188: /**
189: * Returns the next token from this string tokenizer.
190: *
191: * @return the next token from this string tokenizer.
192: *
193: * @throws NoSuchElementException if there are no more tokens in this tokenizer's
194: * string.
195: * @throws IllegalArgumentException if given parameter string format was wrong
196: */
197: public String nextToken() throws NoSuchElementException,
198: IllegalArgumentException {
199:
200: if (!this .hasMoreTokens()) {
201: throw new NoSuchElementException();
202: }
203:
204: if (beforeStart == false) {
205: currentIndex += this .separator.length();
206: } else {
207: beforeStart = false;
208: }
209:
210: StringBuffer token = new StringBuffer();
211: if (this .record.startsWith(this .quate, this .currentIndex)) {
212: String rec = this .record.substring(this .currentIndex
213: + this .quate.length());
214: token.delete(0, token.length());
215: while (true) {
216: final int end = rec.indexOf(this .quate);
217: if (end < 0) {
218: throw new IllegalArgumentException("Illegal format");
219: }
220:
221: if (!rec.startsWith(this .quate, end + 1)) {
222: token.append(rec.substring(0, end));
223: break;
224: }
225: token.append(rec.substring(0, end + 1));
226: rec = rec.substring(end + this .quate.length() * 2);
227: this .currentIndex++;
228: }
229:
230: this .currentIndex += (token.length() + this .quate.length() * 2);
231: } else {
232: final int end = this .record.indexOf(this .separator,
233: this .currentIndex);
234: if (end >= 0) {
235: final int start = this .currentIndex;
236: token.delete(0, token.length());
237: token.append(this .record.substring(start, end));
238: this .currentIndex = end;
239: } else {
240: final int start = this .currentIndex;
241: token.delete(0, token.length());
242: token.append(this .record.substring(start));
243: this .currentIndex = this .record.length();
244: }
245: }
246:
247: return token.toString();
248: }
249:
250: /**
251: * Returns the next token in this string tokenizer's string. First, the set of
252: * characters considered to be separator by this <tt>CSVTokenizer</tt> object is changed
253: * to be the characters in the string <tt>separator</tt>. Then the next token in the
254: * string after the current position is returned. The current position is advanced
255: * beyond the recognized token. The new delimiter set remains the default after this
256: * call.
257: *
258: * @param theSeparator the new separator.
259: * @return the next token, after switching to the new delimiter set.
260: *
261: * @throws java.util.NoSuchElementException
262: * if there are no more tokens in this tokenizer's string.
263: */
264: public String nextToken(final String theSeparator) {
265: separator = theSeparator;
266: return nextToken();
267: }
268:
269: /**
270: * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that
271: * this class can implement the <code>Enumeration</code> interface.
272: *
273: * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
274: *
275: * @see java.util.Enumeration
276: * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens()
277: */
278: public boolean hasMoreElements() {
279: return hasMoreTokens();
280: }
281:
282: /**
283: * Returns the same value as the <code>nextToken</code> method, except that its declared
284: * return value is <code>Object</code> rather than <code>String</code>. It exists so
285: * that this class can implement the <code>Enumeration</code> interface.
286: *
287: * @return the next token in the string.
288: *
289: * @throws java.util.NoSuchElementException
290: * if there are no more tokens in this tokenizer's string.
291: * @see java.util.Enumeration
292: * @see org.jfree.report.util.CSVTokenizer#nextToken()
293: */
294: public Object nextElement() {
295: return nextToken();
296: }
297:
298: /**
299: * Calculates the number of times that this tokenizer's <code>nextToken</code> method
300: * can be called before it generates an exception. The current position is not
301: * advanced.
302: *
303: * @return the number of tokens remaining in the string using the current delimiter
304: * set.
305: *
306: * @see org.jfree.report.util.CSVTokenizer#nextToken()
307: */
308: public int countTokens() {
309: int count = 0;
310:
311: final int preserve = this .currentIndex;
312: final boolean preserveStart = this .beforeStart;
313: while (this .hasMoreTokens()) {
314: this .nextToken();
315: count++;
316: }
317: this .currentIndex = preserve;
318: this .beforeStart = preserveStart;
319:
320: return count;
321: }
322:
323: /**
324: * Returns the quate.
325: *
326: * @return char
327: */
328: public String getQuate() {
329: return this .quate;
330: }
331:
332: /**
333: * Sets the quate.
334: *
335: * @param quate The quate to set
336: */
337: public void setQuate(final String quate) {
338: this.quate = quate;
339: }
340: }
|