The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. : CSV File « Development Class


  

/**

 * 

 * JFreeReport : a free Java reporting library

 * 

 *

 * Project Info:  http://reporting.pentaho.org/

 *

 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.

 *

 * This library is free software; you can redistribute it and/or modify it under the terms

 * of the GNU Lesser General Public License as published by the Free Software Foundation;

 * either version 2.1 of the License, or (at your option) any later version.

 *

 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;

 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 * See the GNU Lesser General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public License along with this

 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,

 * Boston, MA 02111-1307, USA.

 *

 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.

 * in the United States and other countries.]

 *

 * ------------

 * CSVTokenizer.java

 * ------------

 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.

 */



import java.util.Enumeration;

import java.util.NoSuchElementException;



/**

 * The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. The tokenization

 * method is much simpler than the one used by the <code>StringTokenizer</code> class. The <code>CSVTokenizer</code>

 * methods do not distinguish among identifiers, numbers, and quoted strings, nor do they recognize and skip comments.

 * <p/>

 * The set of separator (the characters that separate tokens) may be specified either at creation time or on a per-token

 * basis.

 * <p/>

 * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on whether it was created with the

 * <code>returnSeparators</code> flag having the value <code>true</code> or <code>false</code>: <ul> <li>If the flag is

 * <code>false</code>, delimiter characters serve to separate tokens. A token is a maximal sequence of consecutive

 * characters that are not separator. <li>If the flag is <code>true</code>, delimiter characters are themselves

 * considered to be tokens. A token is thus either one delimiter character, or a maximal sequence of consecutive

 * characters that are not separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current position

 * within the string to be tokenized. Some operations advance this current position past the characters processed.<p> A

 * token is returned by taking a substring of the string that was used to create the <tt>CSVTokenizer</tt> object.

 * <p/>

 * The following is one example of the use of the tokenizer. The code:

 * <blockquote><pre>

 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");

 *     while (csvt.hasMoreTokens()) {

 *         println(csvt.nextToken());

 *     }

 * </pre></blockquote>

 * <p/>

 * prints the following output:

 * <blockquote><pre>

 *     this

 *     is

 *     a

 *     test

 * </pre></blockquote>

 *

 * @author abupon

 */

public class CSVTokenizer implements Enumeration

{

  /**

   * The complete record that should be separated into elements.

   */

  private String record;

  /**

   * The separator.

   */

  private String separator;

  /**

   * The quoting char.

   */

  private String quate;



  /**

   * the current parsing position.

   */

  private int currentIndex;



  /**

   * A flag indicating that the current parse position is before the start.

   */

  private boolean beforeStart;



  /**

   * A possible separator constant.

   */

  public static final String SEPARATOR_COMMA = ",";

  /**

   * A possible separator constant.

   */

  public static final String SEPARATOR_TAB = "\t";

  /**

   * A possible separator constant.

   */

  public static final String SEPARATOR_SPACE = " ";



  /**

   * A possible quote character constant.

   */

  public static final String DOUBLE_QUATE = "\"";

  /**

   * A possible quote character constant.

   */

  public static final String SINGLE_QUATE = "'";



  /**

   * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for

   * separating tokens.

   * <p/>

   * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as

   * tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and

   * only serve as separator between tokens.

   *

   * @param aString      a string to be parsed.

   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).

   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).

   */

  public CSVTokenizer(final String aString, final String theSeparator,

                      final String theQuate)

  {

    if (aString == null)

    {

      throw new NullPointerException("The given string is null");

    }

    if (theSeparator == null)

    {

      throw new NullPointerException("The given separator is null");

    }

    if (theQuate == null)

    {

      throw new NullPointerException("The given quate is null");

    }

    this.record = aString.trim();

    this.separator = theSeparator;

    this.quate = theQuate;

    this.currentIndex = 0;

    this.beforeStart = true;

  }



  /**

   * Constructs a csv tokenizer for the specified string. The characters in the <code>theSeparator</code> argument are

   * the separator for separating tokens. Separator string themselves will not be treated as tokens.

   *

   * @param aString      a string to be parsed.

   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).

   */

  public CSVTokenizer(final String aString, final String theSeparator)

  {

    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);

  }



  /**

   * Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is

   * <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.

   *

   * @param aString a string to be parsed.

   */

  public CSVTokenizer(final String aString)

  {

    this(aString, CSVTokenizer.SEPARATOR_COMMA);

  }



  /**

   * Tests if there are more tokens available from this tokenizer's string. If this method returns <tt>true</tt>, then a

   * subsequent call to <tt>nextToken</tt> with no argument will successfully return a token.

   *

   * @return <code>true</code> if and only if there is at least one token in the string after the current position;

   *         <code>false</code> otherwise.

   */

  public boolean hasMoreTokens()

  {

    return (this.currentIndex < this.record.length());

  }



  /**

   * Returns the next token from this string tokenizer.

   *

   * @return the next token from this string tokenizer.

   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's string.

   * @throws IllegalArgumentException if given parameter string format was wrong

   */

  public String nextToken()

      throws NoSuchElementException, IllegalArgumentException

  {



    if (!this.hasMoreTokens())

    {

      throw new NoSuchElementException();

    }



    if (beforeStart == false)

    {

      currentIndex += this.separator.length();

    }

    else

    {

      beforeStart = false;

    }



    if (this.record.startsWith(this.quate, this.currentIndex))

    {

      final StringBuffer token = new StringBuffer();

      String rec = this.record.substring(this.currentIndex + this.quate.length());



      while (true)

      {

        final int end = rec.indexOf(this.quate);

        if (end < 0)

        {

          throw new IllegalArgumentException("Illegal format");

        }



        if (!rec.startsWith(this.quate, end + 1))

        {

          token.append(rec.substring(0, end));

          break;

        }

        token.append(rec.substring(0, end + 1));

        rec = rec.substring(end + this.quate.length() * 2);

        this.currentIndex++;

      }



      this.currentIndex += (token.length() + this.quate.length() * 2);

      return token.toString();

    }



    final int end = this.record.indexOf(this.separator, this.currentIndex);

    if (end >= 0)

    {

      final int start = this.currentIndex;

      final String token = this.record.substring(start, end);

      this.currentIndex = end;

      return token;

    }

    else

    {

      final int start = this.currentIndex;

      final String token = this.record.substring(start);

      this.currentIndex = this.record.length();

      return token;

    }

  }



  /**

   * Returns the next token in this string tokenizer's string. First, the set of characters considered to be separator

   * by this <tt>CSVTokenizer</tt> object is changed to be the characters in the string <tt>separator</tt>. Then the

   * next token in the string after the current position is returned. The current position is advanced beyond the

   * recognized token.  The new delimiter set remains the default after this call.

   *

   * @param theSeparator the new separator.

   * @return the next token, after switching to the new delimiter set.

   * @throws java.util.NoSuchElementException

   *          if there are no more tokens in this tokenizer's string.

   */

  public String nextToken(final String theSeparator)

  {

    separator = theSeparator;

    return nextToken();

  }



  /**

   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the

   * <code>Enumeration</code> interface.

   *

   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.

   * @see java.util.Enumeration

   * @see org.jfree.report.util.CSVTokenizer#hasMoreTokens()

   */

  public boolean hasMoreElements()

  {

    return hasMoreTokens();

  }



  /**

   * Returns the same value as the <code>nextToken</code> method, except that its declared return value is

   * <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the

   * <code>Enumeration</code> interface.

   *

   * @return the next token in the string.

   * @throws java.util.NoSuchElementException

   *          if there are no more tokens in this tokenizer's string.

   * @see java.util.Enumeration

   * @see org.jfree.report.util.CSVTokenizer#nextToken()

   */

  public Object nextElement()

  {

    return nextToken();

  }



  /**

   * Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it

   * generates an exception. The current position is not advanced.

   *

   * @return the number of tokens remaining in the string using the current delimiter set.

   * @see org.jfree.report.util.CSVTokenizer#nextToken()

   */

  public int countTokens()

  {

    int count = 0;



    final int preserve = this.currentIndex;

    final boolean preserveStart = this.beforeStart;

    while (this.hasMoreTokens())

    {

      this.nextToken();

      count++;

    }

    this.currentIndex = preserve;

    this.beforeStart = preserveStart;



    return count;

  }



  /**

   * Returns the quate.

   *

   * @return char

   */

  public String getQuate()

  {

    return this.quate;

  }



  /**

   * Sets the quate.

   *

   * @param quate The quate to set

   */

  public void setQuate(final String quate)

  {

    this.quate = quate;

  }

}
The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. : CSV File « Development Class « Java