StreamTokenizer.cs : » GIS » DeepEarth » RTools_NTS » Util » C# / CSharp Open Source

1.	2.6.4 mono .net core
2.	2.6.4 mono core
3.	Aspect Oriented Frameworks
4.	Bloggers
5.	Build Systems
6.	Business Application
7.	Charting Reporting Tools
8.	Chat Servers
9.	Code Coverage Tools
10.	Content Management Systems CMS
11.	CRM ERP
12.	Database
13.	Development
14.	Email
15.	Forum
16.	Game
17.	GIS
18.	GUI
19.	IDEs
20.	Installers Generators
21.	Inversion of Control Dependency Injection
22.	Issue Tracking
23.	Logging Tools
24.	Message
25.	Mobile
26.	Network Clients
27.	Network Servers
28.	Office
29.	PDF
30.	Persistence Frameworks
31.	Portals
32.	Profilers
33.	Project Management
34.	RSS RDF
35.	Rule Engines
36.	Script
37.	Search Engines
38.	Sound Audio
39.	Source Control
40.	SQL Clients
41.	Template Engines
42.	Testing
43.	UML
44.	Web Frameworks
45.	Web Service
46.	Web Testing
47.	Wiki Engines
48.	Windows Presentation Foundation
49.	Workflows
50.	XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » GIS » DeepEarth
DeepEarth » RTools_NTS » Util » StreamTokenizer.cs
// StreamTokenizer.cs
// 
// Copyright (C) 2002-2004 Ryan Seghers
//
// This software is provided AS IS. No warranty is granted, 
// neither expressed nor implied. USE THIS SOFTWARE AT YOUR OWN RISK.
// NO REPRESENTATION OF MERCHANTABILITY or FITNESS FOR ANY 
// PURPOSE is given.
//
// License to use this software is limited by the following terms:
// 1) This code may be used in any program, including programs developed
//    for commercial purposes, provided that this notice is included verbatim.
//    
// Also, in return for using this code, please attempt to make your fixes and
// updates available in some way, such as by sending your updates to the
// author.
//
// To-do:
//    make type exclusivity explict, and enforce:
//      digit can be word
//      word can't be whitespace
//      etc.
//      large-integer handling is imprecise, fix it
//          (most 19 digit decimal numbers can be Int64's but I'm 
//           using float anyway)
//    add more mangled float test cases
//
// Later:
//    reconfigurable vs Unicode support
//    add NUnit wrap of built-in tests

using System;
using System.Collections;
using System.IO;
using System.Text;
using System.Collections.Generic;

namespace RTools_NTS.Util{
  // ---------------------------------------------------------------------
  #region Exceptions
  // ---------------------------------------------------------------------

  /// <summary>
  /// Exception class for unterminated tokens.
  /// </summary>
  public class StreamTokenizerUntermException : Exception
  {
    /// <summary>
    /// Construct with a particular message.
    /// </summary>
    /// <param name="msg">The message to store in this object.</param>
    public StreamTokenizerUntermException(string msg) : base(msg) {}
  }

  /// <summary>
  /// Exception class for unterminated quotes.
  /// </summary>
  public class StreamTokenizerUntermQuoteException : StreamTokenizerUntermException
  {
    /// <summary>
    /// Construct with a particular message.
    /// </summary>
    /// <param name="msg">The message to store in this object.</param>
    public StreamTokenizerUntermQuoteException(string msg) : base(msg) {}
  }

  /// <summary>
  /// Exception class for unterminated block comments.
  /// </summary>
  public class StreamTokenizerUntermCommentException : StreamTokenizerUntermException
  {
    /// <summary>
    /// Construct with a particular message.
    /// </summary>
    /// <param name="msg">The message to store in this object.</param>
    public StreamTokenizerUntermCommentException(string msg) : base(msg) {}
  }

  #endregion

  // ---------------------------------------------------------------------
  #region Enumerations
  // ---------------------------------------------------------------------

  /// <summary>
  /// Bitwise enumeration for character types.
  /// </summary>
  [Flags]
  public enum CharTypeBits : byte
  {
    /// <summary>word characters (usually alpha, digits, and domain specific)</summary>
    Word = 1,
    /// <summary># or something for line comments</summary>
    Comment = 2,
    /// <summary>whitespace</summary>
    Whitespace = 4,
    /// <summary>' or " type</summary>
    Quote = 8,
    /// <summary>usually 0 to 9</summary>
    Digit = 16,
    /// <summary>usually 0 to 9, a-f and A-F</summary>
    HexDigit = 32,
    /// <summary>eof char</summary>
    Eof = 64
  }

  #endregion

  /// <summary>
  /// This contains the settings that control the behavior of the tokenizer.
  /// This is separated from the StreamTokenizer so that common settings
  /// are easy to package and keep together.
  /// </summary>
#if !SILVERLIGHT
  [Serializable]
#endif
  public class StreamTokenizerSettings
  {
    // ---------------------------------------------------------------------
    #region Properties
    // ---------------------------------------------------------------------

    private byte[] charTypes;
    /// <summary>
    /// This is the character type table.  Each byte is bitwise encoded
    /// with the character attributes, such as whether that character is
    /// word or whitespace.
    /// </summary>
    public byte[] CharTypes { get { return(charTypes); } }

    bool grabWhitespace;
    /// <summary>
    /// Whether or not to return whitespace tokens.  If not, they're ignored.
    /// </summary>
    public bool GrabWhitespace { get { return(grabWhitespace); } set { grabWhitespace = value; } }

    bool grabEol;
    /// <summary>
    /// Whether or not to return EolTokens on end of line.  Eol tokens will not
    /// break up other tokens which can be multi-line.  For example block comments 
    /// and quotes will not be broken by Eol tokens.  Therefore the number of
    /// Eol tokens does not give you the line count of a stream.
    /// </summary>
    public bool GrabEol { get { return(grabEol); } set { grabEol = value; } }

    bool slashSlashComments;
    /// <summary>
    /// Whether or not to look for // comments
    /// </summary>
    public bool SlashSlashComments { get { return(slashSlashComments); } set { slashSlashComments = value; } }

    bool slashStarComments;
    /// <summary>
    /// Whether or not to look for /* */ block comments.
    /// </summary>
    public bool SlashStarComments { get { return(slashStarComments); } set { slashStarComments = value; } }

    bool grabComments;
    /// <summary>
    /// Whether or not to return comments.
    /// </summary>
    public bool GrabComments { get { return(grabComments); } set { grabComments = value; } }

    bool doUntermCheck;
    /// <summary>
    /// Whether or not to check for unterminated quotes and block comments.
    /// If true, and one is encoutered, an exception is thrown of the appropriate type.
    /// </summary>
    public bool DoUntermCheck { get { return(doUntermCheck); } set { doUntermCheck = value; } }

    bool parseNumbers;
    /// <summary>
    /// Whether or not digits are specified as Digit type in the
    /// character table.
    /// This setting is based on the character types table, so this
    /// setting interacts with character type table manipulation.
    /// This setting may become incorrect if you modify the character
    /// types table directly.
    /// </summary>
    public bool ParseNumbers
    {
      get { return(parseNumbers); }
/* dropped for speed, this means this property isn't accurate if
 * character types table is modified directly.
 *       { 
        for (int i = '0'; i <= '9'; i++)
        {
          if (!IsCharType((char)i, CharTypeBits.Digit)) 
          {
            return(false);
          }
        }

        return(true); 
      }
*/
      set 
      {
        if (value)
        {
          for (int i = '0'; i <= '9'; i++) 
            charTypes[i] |= (byte)CharTypeBits.Digit;
        }
        else
        {
          byte digit = (byte)CharTypeBits.Digit;

          for (int i = '0'; i <= '9'; i++) 
          {
            charTypes[i] &= (byte)(~digit); // not digit
          }
        }
        parseNumbers = value;
      }
    }

    bool parseHexNumbers;

    /// <summary>
    /// Whether or not to parse Hex (0xABCD...) numbers.
    /// This setting is based on the character types table, so this
    /// setting interacts with character type table manipulation.
    /// </summary>
    public bool ParseHexNumbers
    {
      get 
      { 
        return(parseHexNumbers);
//        for (int i = 'A'; i <= 'F'; i++)
//        {
//          if (!IsCharType((char)i, CharTypeBits.Digit)) 
//          {
//            return(false);
//          }
//        }
//        for (int i = 'a'; i <= 'f'; i++)
//        {
//          if (!IsCharType((char)i, CharTypeBits.Digit)) 
//          {
//            return(false);
//          }
//        }
//        if (!IsCharType('x', CharTypeBits.Digit)) return(false);
//
//        return(true); 
      }
      set 
      {
        parseHexNumbers = value;
        if (parseHexNumbers)
        {
          for (int i = '0'; i <= '9'; i++)
            charTypes[i] |= (byte)CharTypeBits.HexDigit;
          for (int i = 'A'; i <= 'F'; i++) 
            charTypes[i] |= (byte)CharTypeBits.HexDigit;
          for (int i = 'a'; i <= 'f'; i++) 
            charTypes[i] |= (byte)CharTypeBits.HexDigit;
          charTypes[(int)'x'] |= (byte)CharTypeBits.HexDigit;
        }
        else
        {
          byte digit = (byte)CharTypeBits.HexDigit;

          for (int i = 'A'; i <= 'F'; i++) 
          {
            charTypes[i] &= (byte)(~digit); // not digit
          }
          for (int i = 'a'; i <= 'f'; i++) 
          {
            charTypes[i] &= (byte)(~digit); // not digit
          }
          charTypes[(int)'x'] &= (byte)(~digit);
        }
      }
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Constructors/Destructor
    // ---------------------------------------------------------------------

    /// <summary>
    /// Default constructor.
    /// </summary>
    public StreamTokenizerSettings()
    {
      charTypes = new byte[StreamTokenizer.NChars + 1];  // plus an EOF entry
      SetDefaults();
    }

    /// <summary>
    /// Copy constructor.
    /// </summary>
    public StreamTokenizerSettings(StreamTokenizerSettings other)
    {
      Copy(other);
    }

    /// <summary>
    /// Sets this object to be the same as the specified object.
    /// Note that some settings which are entirely embodied by the character
    /// type table.
    /// </summary>
    public void Copy(StreamTokenizerSettings other)
    {
      charTypes = new byte[StreamTokenizer.NChars + 1];  // plus an EOF entry
      Array.Copy(other.charTypes, 0, charTypes, 0, charTypes.Length);
      
      grabWhitespace = other.grabWhitespace;
      grabEol = other.grabEol;
      slashSlashComments = other.slashSlashComments;
      slashStarComments = other.slashStarComments;
      grabComments = other.grabComments;
      doUntermCheck = other.doUntermCheck;

      parseHexNumbers = other.parseHexNumbers;
    }

    #endregion

    // ---------------------------------------------------------------------
    #region main Setup
    // ---------------------------------------------------------------------

    /// <summary>
    /// Setup default parse behavior.
    /// This resets to same behavior as on construction.
    /// </summary>
    /// <returns>bool - true for success.</returns>
    public bool SetDefaults()
    {
      slashStarComments = false;
      grabComments = false;
      slashSlashComments = false;
      grabWhitespace = false;
      doUntermCheck = true;
      grabEol = false;

      // setup table
      ResetCharTypeTable();
      ParseNumbers = true;
      ParseHexNumbers = true;
      WordChars('A', 'Z');
      WordChars('a', 'z');
      WhitespaceChars(0, ' ');
      QuoteChar('\'');
      QuoteChar('"');
      WordChars('0', '9');

      return(true);
    }

    /// <summary>
    /// Apply settings which are commonly used for code parsing
    /// C-endCapStyle code, including C++, C#, and Java.
    /// </summary>
    /// <returns></returns>
    public bool SetupForCodeParse()
    {
      GrabWhitespace = true;
      GrabComments = true;
      SlashSlashComments = true;
      DoUntermCheck = true;
      SlashStarComments = true;
      WordChar('_');
      ParseNumbers = true;
      ParseHexNumbers = true;
      return(true);
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Character Table Setup
    // ---------------------------------------------------------------------

    /// <summary>
    /// Clear the character type settings.  This leaves them unset,
    /// as opposed to the default.  Use SetDefaults() for default
    /// settings.
    /// </summary>
    public void ResetCharTypeTable() 
    {
      Array.Clear(charTypes, 0, charTypes.Length);
      charTypes[StreamTokenizer.NChars] = (byte)CharTypeBits.Eof; // last entry for Eof
    }

    /// <summary>
    /// Specify that a particular character is a word character.
    /// Character table type manipulation method.
    /// This adds the type to the char(s), rather
    /// than overwriting other types.
    /// </summary>
    /// <param name="c">The character.</param>
    public void WordChar(int c) 
    {
      charTypes[c] |= (byte)CharTypeBits.Word;
    }

    /// <summary>
    /// Specify that a range of characters are word characters.
    /// Character table type manipulation method.
    /// This adds the type to the char(s), rather
    /// than overwriting other types.
    /// </summary>
    /// <param name="startChar">First character.</param>
    /// <param name="endChar">Last character.</param>
    public void WordChars(int startChar, int endChar) 
    {
      for (int i = startChar; i <= endChar; i++)
      {
        charTypes[i] |= (byte)CharTypeBits.Word;
      }
    }

    /// <summary>
    /// Specify that a string of characters are word characters.
    /// Character table type manipulation method.
    /// This adds the type to the char(s), rather
    /// than overwriting other types.
    /// </summary>
    /// <param name="s"></param>
    public void WordChars(string s)
    {
      for (int i = 0; i < s.Length; i++)
        charTypes[s[i]] |= (byte)CharTypeBits.Word;
    }

    /// <summary>
    /// Specify that a character is a whitespace character.
    /// Character table type manipulation method.
    /// This type is exclusive with other types.
    /// </summary>
    /// <param name="c">The character.</param>
    public void WhitespaceChar(int c) 
    {
      charTypes[c] = (byte)CharTypeBits.Whitespace;
    }

    /// <summary>
    /// Specify that a range of characters are whitespace characters.
    /// Character table type manipulation method.
    /// This adds the characteristic to the char(s), rather
    /// than overwriting other characteristics.
    /// </summary>
    /// <param name="startChar">First character.</param>
    /// <param name="endChar">Last character.</param>
    public void WhitespaceChars(int startChar, int endChar) 
    {
      for (int i = startChar; i <= endChar; i++)
        charTypes[i] = (byte)CharTypeBits.Whitespace;
    }

    /// <summary>
    /// Remove other type settings from a range of characters.
    /// Character table type manipulation method.
    /// </summary>
    /// <param name="startChar"></param>
    /// <param name="endChar"></param>
    public void OrdinaryChars(int startChar, int endChar) 
    {
      for (int i = startChar; i <= endChar; i++)
        charTypes[i] = 0;
    }

    /// <summary>
    /// Remove other type settings from a character.
    /// Character table type manipulation method.
    /// </summary>
    /// <param name="c"></param>
    public void OrdinaryChar(int c) 
    {
      charTypes[c] = 0;
    }

    /// <summary>
    /// Specify that a particular character is a comment-starting character.
    /// Character table type manipulation method.
    /// </summary>
    /// <param name="c"></param>
    public void CommentChar(int c) 
    {
      charTypes[c] = (byte)CharTypeBits.Comment;
    }

    /// <summary>
    /// Specify that a particular character is a quote character.
    /// Character table type manipulation method.
    /// </summary>
    /// <param name="c"></param>
    public void QuoteChar(int c) 
    {
      charTypes[c] = (byte)CharTypeBits.Quote;
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Utility Methods
    // ---------------------------------------------------------------------

    /// <summary>
    /// Return a string representation of a character type setting.
    /// Since the type setting is bitwise encoded, a character
    /// can have more than one type.
    /// </summary>
    /// <param name="ctype">The character type byte.</param>
    /// <returns>The string representation of the type flags.</returns>
    public string CharTypeToString(byte ctype)
    {
      StringBuilder str = new StringBuilder();

      if (IsCharType(ctype, CharTypeBits.Quote)) str.Append('q');
      if (IsCharType(ctype, CharTypeBits.Comment)) str.Append('m');
      if (IsCharType(ctype, CharTypeBits.Whitespace)) str.Append('w');
      if (IsCharType(ctype, CharTypeBits.Digit)) str.Append('d');
      if (IsCharType(ctype, CharTypeBits.Word)) str.Append('a');
      if (IsCharType(ctype, CharTypeBits.Eof)) str.Append('e');
      if (str.Length == 0)
      {
        str.Append('c');
      }
      return(str.ToString());
    }

    /// <summary>
    /// Check whether the specified char type byte has a 
    /// particular type flag set.
    /// </summary>
    /// <param name="ctype">The char type byte.</param>
    /// <param name="type">The CharTypeBits entry to compare to.</param>
    /// <returns>bool - true or false</returns>
    public bool IsCharType(byte ctype, CharTypeBits type)
    {
      return((ctype & (byte)type) != 0);
    }

    /// <summary>
    /// Check whether the specified char has a 
    /// particular type flag set.
    /// </summary>
    /// <param name="c">The character.</param>
    /// <param name="type">The CharTypeBits entry to compare to.</param>
    /// <returns>bool - true or false</returns>
    public bool IsCharType(char c, CharTypeBits type)
    {
      return((charTypes[c] & (byte)type) != 0);
    }

    /// <summary>
    /// Check whether the specified char has a 
    /// particular type flag set.
    /// </summary>
    /// <param name="c">The character.</param>
    /// <param name="type">The CharTypeBits entry to compare to.</param>
    /// <returns>bool - true or false</returns>
    public bool IsCharType(int c, CharTypeBits type)
    {
      return((charTypes[c] & (byte)type) != 0);
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Standard Methods
    // ---------------------------------------------------------------------

    /// <summary>
    /// Display the state of this object.
    /// </summary>
    public void Display()
    {
      Display(String.Empty);
    }

    /// <summary>
    /// Display the state of this object, with a per-line prefix.
    /// </summary>
    /// <param name="prefix">The pre-line prefix.</param>
    public void Display(string prefix)
    {      
    }
    #endregion
  }

  /// <summary>
  /// A StreamTokenizer similar to Java's.  This breaks an input stream
  /// (coming from a TextReader) into Tokens based on various settings.  The settings
  /// are stored in the TokenizerSettings property, which is a
  /// StreamTokenizerSettings instance.
  /// </summary>
  /// <remarks>
  /// <para>
  /// This is configurable in that you can modify TokenizerSettings.CharTypes[] array
  /// to specify which characters are which type, along with other settings
  /// such as whether to look for comments or not.
  /// </para>
  /// <para>
  /// WARNING: This is not internationalized.  This treats all characters beyond
  /// the 7-bit ASCII range (decimal 127) as Word characters.
  /// </para>
  /// <para>
  /// There are two main ways to use this: 1) Parse the entire stream at
  /// once and get an ArrayList of Tokens (see the Tokenize* methods), 
  /// and 2) call NextToken() successively.
  /// This reads from a TextReader, which you can set directly, and this
  /// also provides some convenient methods to parse files and strings.
  /// This returns an Eof token if the end of the input is reached.
  /// </para>
  /// <para>
  /// Here's an example of the NextToken() endCapStyle of use:
  /// <code>
  /// StreamTokenizer tokenizer = new StreamTokenizer();
  /// tokenizer.GrabWhitespace = true;
  /// tokenizer.Verbosity = VerbosityLevel.Debug; // just for debugging
  /// tokenizer.TextReader = File.OpenText(fileName);
  /// Token token;
  /// while (tokenizer.NextToken(out token)) log.Info("Token = '{0}'", token);
  /// </code>
  /// </para>
  /// <para>
  /// Here's an example of the Tokenize... endCapStyle of use:
  /// <code>
  /// StreamTokenizer tokenizer = new StreamTokenizer("some string");
  /// ArrayList tokens = new ArrayList();
  /// if (!tokenizer.Tokenize(tokens)) 
  /// { 
  ///    // error handling
  /// }
  /// foreach (Token t in tokens) Console.WriteLine("t = {0}", t);
  /// </code>
  /// </para>
  /// <para>
  /// Comment delimiters are hardcoded (// and /*), not affected by char type table.
  /// </para>
  /// <para>
  /// This sets line numbers in the tokens it produces.  These numbers are normally
  /// the line on which the token starts.
  /// There is one known caveat, and that is that when GrabWhitespace setting
  /// is true, and a whitespace token contains a newline, that token's line number
  /// will be set to the following line rather than the line on which the token
  /// started.
  /// </para>
  /// </remarks>
  public class StreamTokenizer
  {
    // ----------------------------------------------------------------
    #region Constants
    // ----------------------------------------------------------------

    /// <summary>
    /// This is the number of characters in the character table.
    /// </summary>
    public static readonly int NChars = 128;
    private static readonly int Eof = NChars;
    #endregion

    // ----------------------------------------------------------------
    #region Private Fields
    // ----------------------------------------------------------------

    // A class for verbosity/message handling
    private Logger log;

    // The TextReader we're reading from
    private TextReader textReader;

    // buffered wrap of reader
    //private BufferedTextReader bufferedReader; // was slower

    // keep track of current line number during parse
    private int lineNumber;  

    // used to back up in the stream
    private CharBuffer backString;  

    // used to collect characters of the current (next to be
    // emitted) token
    private CharBuffer nextTokenSb;

    // for speed, construct these once and re-use
    private CharBuffer tmpSb;
    private CharBuffer expSb;

    #endregion

    // ----------------------------------------------------------------------
    #region Properties
    // ----------------------------------------------------------------------

    /// <summary>
    /// This is the TextReader that this object will read from.
    /// Set this to set the input reader for the parse.
    /// </summary>
    public TextReader TextReader
    {
      get { return(textReader); }
      set { textReader = value; }
    }

    private StreamTokenizerSettings settings;
    /// <summary>
    /// The settings which govern the behavior of the tokenization.
    /// </summary>
    public StreamTokenizerSettings Settings { get { return(settings); } }

    /// <summary>
    /// The verbosity level for this object's Logger.
    /// </summary>
    public VerbosityLevel Verbosity 
    { 
      get { return(log.Verbosity); } 
      set { log.Verbosity = value; } 
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Constructors/Destructor
    // ---------------------------------------------------------------------

    /// <summary>
    /// Default constructor.
    /// </summary>
    public StreamTokenizer()
    {
      Initialize();
    }

    /// <summary>
    /// Construct and set this object's TextReader to the one specified.
    /// </summary>
    /// <param name="sr">The TextReader to read from.</param>
    public StreamTokenizer(TextReader sr)
    {
      Initialize();
      textReader = sr;
    }

    /// <summary>
    /// Construct and set a string to tokenize.
    /// </summary>
    /// <param name="str">The string to tokenize.</param>
    public StreamTokenizer(string str)
    {
      Initialize();
      textReader = new StringReader(str);
    }

    /// <summary>
    /// Utility function, things common to constructors.
    /// </summary>
    void Initialize()
    {
      log = new Logger("StreamTokenizer");
      log.Verbosity = VerbosityLevel.Warn;
      backString = new CharBuffer(32);
      nextTokenSb = new CharBuffer(1024);

      InitializeStream();
      settings = new StreamTokenizerSettings();
      settings.SetDefaults();

      expSb = new CharBuffer();
      tmpSb = new CharBuffer();
    }

    /// <summary>
    /// Clear the stream settings.
    /// </summary>
    void InitializeStream()
    {
      lineNumber = 1; // base 1 line numbers
      textReader = null;
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Standard Methods
    // ---------------------------------------------------------------------

    /// <summary>
    /// Display the state of this object.
    /// </summary>
    public void Display()
    {
      Display(String.Empty);
    }

    /// <summary>
    /// Display the state of this object, with a per-line prefix.
    /// </summary>
    /// <param name="prefix">The pre-line prefix.</param>
    public void Display(string prefix)
    {
      log.WriteLine(prefix + "StreamTokenizer display:");
      log.WriteLine(prefix + "    textReader: {0}", (textReader == null ? "null" : "non-null"));
      log.WriteLine(prefix + "    backString: {0}", backString);

      if (settings != null) settings.Display(prefix + "    ");
    }

    #endregion

    // ---------------------------------------------------------------------
    #region NextToken (the state machine)
    // ---------------------------------------------------------------------

    /// <summary>
    /// The states of the state machine.
    /// </summary>
    private enum NextTokenState
    {
      Start,
      Whitespace,
      Word,
      Quote,
      EndQuote,
      MaybeNumber, // could be number or word
      MaybeComment, // after first slash, might be comment or not
      MaybeHex, // after 0, may be hex
      HexGot0x, // after 0x, may be hex
      HexNumber,
      LineComment,
      BlockComment,
      EndBlockComment,
      Char,
      Eol,
      Eof,
      Invalid
    }

    /// <summary>
    /// Pick the next state given just a single character.  This is used
    /// at the start of a new token.
    /// </summary>
    /// <param name="ctype">The type of the character.</param>
    /// <param name="c">The character.</param>
    /// <returns>The state.</returns>
    private NextTokenState PickNextState(byte ctype, int c)
    {
      return(PickNextState(ctype, c, NextTokenState.Start));
    }

    /// <summary>
    /// Pick the next state given just a single character.  This is used
    /// at the start of a new token.
    /// </summary>
    /// <param name="ctype">The type of the character.</param>
    /// <param name="c">The character.</param>
    /// <param name="excludeState">Exclude this state from the possible next state.</param>
    /// <returns>The state.</returns>
    private NextTokenState PickNextState(byte ctype, int c, NextTokenState excludeState)
    {
      if (c == '/') 
      {
        return(NextTokenState.MaybeComment); // overrides all other cats
      }
      else if ((excludeState != NextTokenState.MaybeHex) 
        && settings.ParseHexNumbers && (c == '0')) 
      {
        return(NextTokenState.MaybeHex);
      }
      else if ((excludeState != NextTokenState.MaybeNumber) && settings.ParseNumbers 
        && (settings.IsCharType(ctype, CharTypeBits.Digit) || (c == '-') || (c == '.'))) 
      {
        return(NextTokenState.MaybeNumber);
      }
      else if (settings.IsCharType(ctype, CharTypeBits.Word)) return(NextTokenState.Word);
      else if (settings.GrabEol && (c == 10)) return(NextTokenState.Eol);
      else if (settings.IsCharType(ctype, CharTypeBits.Whitespace)) return(NextTokenState.Whitespace);
      else if (settings.IsCharType(ctype, CharTypeBits.Comment)) return(NextTokenState.LineComment);
      else if (settings.IsCharType(ctype, CharTypeBits.Quote)) return(NextTokenState.Quote);
      else if ((c == Eof) || (settings.IsCharType(ctype, CharTypeBits.Eof))) return(NextTokenState.Eof);
      return(NextTokenState.Char);
    }

    /// <summary>
    /// Read the next character from the stream, or from backString
    /// if we backed up.
    /// </summary>
    /// <returns>The next character.</returns>
    private int GetNextChar()
    {
      int c;

      // consume from backString if possible
      if (backString.Length > 0)
      {
        c = backString[0];
        backString.Remove(0, 1);
        #if DEBUG
        log.Debug("Backup char '{0}'", (char)c);
        #endif
        return(c);
      }

      if (textReader == null) return(Eof);

      try
      {
        while((c = textReader.Read()) == 13) {} // skip LF (13)
      }
      catch(Exception)
      {
        return(Eof);
      }

      if (c == 10) 
      {
        lineNumber++;
        #if DEBUG
        log.Debug("Line number incremented to {0}", lineNumber);
        #endif
      }
      else if (c < 0) 
      {
        c = Eof;
      }

      #if DEBUG
      log.Debug("Read char '{0}' ({1})", (char)c, c);
      #endif
      return(c);
    }

    /// <summary>
    /// Get the next token.  The last token will be an EofToken unless
    /// there's an unterminated quote or unterminated block comment
    /// and Settings.DoUntermCheck is true, in which case this throws
    /// an exception of type StreamTokenizerUntermException or sub-class.
    /// </summary>
    /// <param name="token">The output token.</param>
    /// <returns>bool - true for success, false for failure.</returns>
    public bool NextToken(out Token token)
    {
      token = null;
      int thisChar = 0; // current character
      byte ctype; // type of this character

      NextTokenState state = NextTokenState.Start;
      int prevChar = 0; // previous character
      byte prevCtype = (byte)CharTypeBits.Eof;

      // get previous char from nextTokenSb if there
      // (nextTokenSb is a StringBuilder containing the characters
      //  of the next token to be emitted)
      if (nextTokenSb.Length > 0) 
      {
        prevChar = nextTokenSb[nextTokenSb.Length - 1];
        prevCtype = settings.CharTypes[prevChar];
        state = PickNextState(prevCtype, prevChar);
      }

      // extra state for number parse
      int seenDot = 0; // how many .'s in the number
      int seenE = 0; // how many e's or E's have we seen in the number
      bool seenDigit = false; // seen any digits (numbers can start with -)

      // lineNumber can change with each GetNextChar()
      // tokenLineNumber is the line on which the token started
      int tokenLineNumber = lineNumber;

      // State Machine: Produces a single token.
      // Enter a state based on a single character.
      // Generally, being in a state means we're currently collecting chars 
      // in that type of token.
      // We do state machine until it builds a token (Eof is a token), then
      // return that token.
      thisChar = prevChar;  // for first iteration, since prevChar is set to this 
      bool done = false; // optimization
      while (!done)
      {
        prevChar = thisChar;
        thisChar = GetNextChar();
        if (thisChar >= settings.CharTypes.Length)
        {
          // greater than 7-bit ascii, treat as word character
          ctype = (byte)CharTypeBits.Word;
        }
        else ctype = settings.CharTypes[thisChar];

        #if DEBUG
        log.Debug("Before switch: state = {0}, thisChar = '{1}'", state, (char)thisChar);
        #endif

        // see if we need to change states, or emit a token
        switch(state)
        {
          case NextTokenState.Start:
            // RESET
            state = PickNextState(ctype, thisChar);
            tokenLineNumber = lineNumber;
            break;

          case NextTokenState.Char:
            token = new CharToken((char)prevChar, tokenLineNumber);
            done = true;
            nextTokenSb.Length = 0;
            break;

          case NextTokenState.Word:
            if ((!settings.IsCharType(ctype, CharTypeBits.Word))
              && (!settings.IsCharType(ctype, CharTypeBits.Digit)))
            {
              // end of word, emit
              token = new WordToken(nextTokenSb.ToString(), tokenLineNumber);
              done = true;
              nextTokenSb.Length = 0;
            }
            break;

          case NextTokenState.Whitespace:
            if (!settings.IsCharType(ctype, CharTypeBits.Whitespace)
              || (settings.GrabEol && (thisChar == 10)))
            {
              // end of whitespace, emit
              if (settings.GrabWhitespace)
              {
                token = new WhitespaceToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
              }
              else
              {
                // RESET
                nextTokenSb.Length = 0;
                tokenLineNumber = lineNumber;
                state = PickNextState(ctype, thisChar);
              }
            }
            break;

          case NextTokenState.EndQuote:
            // we're now 1 char after end of quote
            token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
            done = true;
            nextTokenSb.Length = 0;
            break;

          case NextTokenState.Quote:
            // looking for end quote matching char that started the quote
            if (thisChar == nextTokenSb[0])
            {
              // handle escaped backslashes: count the immediately prior backslashes 
              // - even (including 0) means it's not escaped 
              // - odd means it is escaped 
              int backSlashCount = 0; 
              for (int i = nextTokenSb.Length - 1; i >= 0; i--)
              { 
                if (nextTokenSb[ i ] == '\\') backSlashCount++; 
                else break; 
              } 

              if ((backSlashCount % 2) == 0) 
              { 
                state = NextTokenState.EndQuote;
              }
            }

            if ((state != NextTokenState.EndQuote) && (thisChar == Eof))
            {
              if (settings.DoUntermCheck) 
              {
                nextTokenSb.Length = 0;
                throw new StreamTokenizerUntermQuoteException("Unterminated quote");
              }

              token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
              done = true;
              nextTokenSb.Length = 0;
            }
            break;

          case NextTokenState.MaybeComment:
            if (thisChar == Eof)
            {
              token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
              done = true;
              nextTokenSb.Length = 0;
            }
            else
            {
              // if we get the right char, we're in a comment
              if (settings.SlashSlashComments && (thisChar == '/')) 
                state = NextTokenState.LineComment;
              else if (settings.SlashStarComments && (thisChar == '*')) 
                state = NextTokenState.BlockComment;
              else
              {
                token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
              }
            }
            break;

          case NextTokenState.LineComment:
            if (thisChar == Eof)
            {
              if (settings.GrabComments) 
              {
                token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
              }
              else
              {
                // RESET
                nextTokenSb.Length = 0;
                tokenLineNumber = lineNumber;
                state = PickNextState(ctype, thisChar);
              }
            }
            else
            {
              if (thisChar == '\n')
              {
                if (settings.GrabComments) 
                {
                  token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                  done = true;
                  nextTokenSb.Length = 0;
                }
                else
                {
                  // RESET
                  nextTokenSb.Length = 0;
                  tokenLineNumber = lineNumber;
                  state = PickNextState(ctype, thisChar);
                }
              }
            }
            break;

          case NextTokenState.BlockComment:
            if (thisChar == Eof)
            {
              if (settings.DoUntermCheck) 
              {
                nextTokenSb.Length = 0;
                throw new StreamTokenizerUntermCommentException("Unterminated comment.");
              }

              if (settings.GrabComments) 
              {
                token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
              }
              else
              {
                // RESET
                nextTokenSb.Length = 0;
                tokenLineNumber = lineNumber;
                state = PickNextState(ctype, thisChar);
              }
            }
            else
            {
              if ((thisChar == '/') && (prevChar == '*'))
              {
                state = NextTokenState.EndBlockComment;
              }
            }
            break;

          // special case for 2-character token termination
          case NextTokenState.EndBlockComment:
            if (settings.GrabComments) 
            {
              token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
              done = true;
              nextTokenSb.Length = 0;
            }
            else
            {
              // RESET
              nextTokenSb.Length = 0;
              tokenLineNumber = lineNumber;
              state = PickNextState(ctype, thisChar);
            }
            break;

          case NextTokenState.MaybeHex:
            // previous char was 0
            if (thisChar != 'x')
            {
              // back up and try non-hex
              // back up to the 0
              nextTokenSb.Append((char)thisChar);
              backString.Append(nextTokenSb);
              nextTokenSb.Length = 0;

              // reset state and don't choose MaybeNumber state.
              // pull char from backString
              thisChar = backString[0];
              backString.Remove(0, 1);
              state = PickNextState(settings.CharTypes[thisChar], (int)thisChar, 
                NextTokenState.MaybeHex);
              #if DEBUG
              log.Debug("HexGot0x: Next state on '{0}' is {1}", (char)thisChar,
                state);
              #endif
            }
            else state = NextTokenState.HexGot0x;
            break;

          case NextTokenState.HexGot0x:
            if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
            {
              // got 0x but now a non-hex char
              // back up to the 0
              nextTokenSb.Append((char)thisChar);
              backString.Append(nextTokenSb);
              nextTokenSb.Length = 0;

              // reset state and don't choose MaybeNumber state.
              // pull char from backString
              thisChar = backString[0];
              backString.Remove(0, 1);
              state = PickNextState(settings.CharTypes[thisChar], (int)thisChar, 
                NextTokenState.MaybeHex);
              #if DEBUG
              log.Debug("HexGot0x: Next state on '{0}' is {1}", (char)thisChar,
                state);
              #endif
            }
            else state = NextTokenState.HexNumber;
            break;

          case NextTokenState.HexNumber:
            if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
            {
              // emit the hex number we've collected
              #if DEBUG
              log.Debug("Emit hex IntToken from string '{0}'", nextTokenSb);
              #endif
              token = IntToken.ParseHex(nextTokenSb.ToString(), tokenLineNumber);
              done = true;
              nextTokenSb.Length = 0;
            }
            break;

          case NextTokenState.MaybeNumber:
            //
            // Determine whether or not to stop collecting characters for
            // the number parse.  We terminate when it's clear it's not
            // a number or no longer a number.
            //
            bool term = false;

            if (settings.IsCharType(ctype, CharTypeBits.Digit)  
              || settings.IsCharType(prevChar, CharTypeBits.Digit)) seenDigit = true;

            // term conditions
            if (thisChar == '.') 
            { 
              seenDot++; 
              if (seenDot > 1) term = true;  // more than one dot, it aint a number
            }
            else if (((thisChar == 'e') || (thisChar == 'E')))
            {
              seenE++;
              if (!seenDigit) term = true;  // e before any digits is bad
              else if (seenE > 1) term = true;  // more than 1 e is bad
              else
              {
                term = true; // done regardless

                // scan the exponent, put its characters into
                // nextTokenSb, if there are any
                char c;
                expSb.Clear();
                expSb.Append((char)thisChar);
                if (GrabInt(expSb, true, out c))
                {
                  // we got a good exponent, tack it on
                  nextTokenSb.Append(expSb);
                  thisChar = c; // and continue after the exponent's characters
                }
              }
            }
            else if (thisChar == Eof) term = true;  
              // or a char that can't be in a number
            else if ((!settings.IsCharType(ctype, CharTypeBits.Digit) 
              && (thisChar != 'e') && (thisChar != 'E') 
              && (thisChar != '-') && (thisChar != '.')) 
              || ((thisChar == '+') && (seenE == 0)))
            {
              // it's not a normal number character
              term = true;
            }
            // or a dash not after e
            else if ((thisChar == '-') && (!((prevChar == 'e') || (prevChar == 'E')))) term = true;

            if (term)
            {
              // we are terminating a number, or it wasn't a number
              if (seenDigit)
              {
                if ((nextTokenSb.IndexOf('.') >= 0)
                  || (nextTokenSb.IndexOf('e') >= 0)
                  || (nextTokenSb.IndexOf('E') >= 0)
                  || (nextTokenSb.Length >= 19) // probably too large for Int64, use float
                  )
                {
                  token = new FloatToken(nextTokenSb.ToString(), tokenLineNumber);
                  #if DEBUG
                  log.Debug("Emit FloatToken from string '{0}'", nextTokenSb);
                  #endif
                }
                else 
                {
                  #if DEBUG
                  log.Debug("Emit IntToken from string '{0}'", nextTokenSb);
                  #endif
                  token = new IntToken(nextTokenSb.ToString(), tokenLineNumber);
                }
                done = true;
                nextTokenSb.Length = 0;
              }
              else
              {
                // -whatever or -.whatever
                // didn't see any digits, must have gotten here by a leading -
                // and no digits after it
                // back up to -, pick next state excluding numbers
                nextTokenSb.Append((char)thisChar);
                backString.Append(nextTokenSb);
                nextTokenSb.Length = 0;

                // restart on the - and don't choose MaybeNumber state
                // pull char from backString
                thisChar = backString[0];
                backString.Remove(0, 1);
                state = PickNextState(settings.CharTypes[thisChar], (int)thisChar, 
                  NextTokenState.MaybeNumber);
                #if DEBUG
                log.Debug("MaybeNumber: Next state on '{0}' is {1}", (char)thisChar,
                  state);
                #endif
              }
            }
            break;

          case NextTokenState.Eol:
            // tokenLineNumber - 1 because the newline char is on the previous line
            token = new EolToken(tokenLineNumber - 1);
            done = true;
            nextTokenSb.Length = 0;
            break;

          case NextTokenState.Eof:
            token = new EofToken(tokenLineNumber);
            done = true;
            nextTokenSb.Length = 0;
            return(false);

          case NextTokenState.Invalid:
          default:
            // not a good sign, some unrepresented state?
            log.Error("NextToken: Hit unrepresented state {0}", state);
            return(false);
        }

        // use a StringBuilder to accumulate characters which are part of this token
        if (thisChar != Eof) nextTokenSb.Append((char)thisChar);
        #if DEBUG
        log.Debug("After switch: state = {0}, nextTokenSb = '{1}', backString = '{2}'", 
          state, nextTokenSb, backString);
        #endif
      }

      #if DEBUG
      log.Debug("Got token {0}", token.ToDebugString());
      #endif
      return(true);
    }

    /// <summary>
    /// Starting from current stream location, scan forward
    /// over an int.  Determine whether it's an integer or not.  If so, 
    /// push the integer characters to the specified CharBuffer.  
    /// If not, put them in backString (essentially leave the
    /// stream as it was) and return false.
    /// <para>
    /// If it was an int, the stream is left 1 character after the
    /// end of the int, and that character is output in the thisChar parameter.
    /// </para>
    /// <para>The formats for integers are: 1, +1, and -1</para>
    /// The + and - signs are included in the output buffer.
    /// </summary>
    /// <param name="sb">The CharBuffer to append to.</param>
    /// <param name="allowPlus">Whether or not to consider + to be part
    /// of an integer.</param>
    /// <param name="thisChar">The last character read by this method.</param>
    /// <returns>true for parsed an int, false for not an int</returns>
    private bool GrabInt(CharBuffer sb, bool allowPlus, out char thisChar)
    {
      tmpSb.Clear(); // use tmp CharBuffer

      // first character can be -, maybe can be + depending on arg
      thisChar = (char)GetNextChar();
      if (thisChar == Eof)
      {
        return(false);
      }
      else if (thisChar == '+')
      {
        if (allowPlus) 
        {
          tmpSb.Append(thisChar);
        }
        else
        {
          backString.Append(thisChar);
          return(false);
        }
      }
      else if (thisChar == '-')
      {
        tmpSb.Append(thisChar);
      }
      else if (settings.IsCharType(thisChar, CharTypeBits.Digit))
      {
        // a digit, back this out so we can handle it in loop below
        backString.Append(thisChar);
      }
      else 
      {
        // not a number starter
        backString.Append(thisChar);
        return(false);
      }

      // rest of chars have to be digits
      bool gotInt = false;
      while(((thisChar = (char)GetNextChar()) != Eof)
        && (settings.IsCharType(thisChar, CharTypeBits.Digit)))
      {
        gotInt = true;
        tmpSb.Append(thisChar);
      }

      if (gotInt) 
      {
        sb.Append(tmpSb);
        #if DEBUG
        log.Debug("Grabbed int {0}, sb = {1}", tmpSb, sb);
        #endif
        return(true);
      }
      else 
      {
        // didn't get any chars after first 
        backString.Append(tmpSb); // put + or - back on
        if (thisChar != Eof) backString.Append(thisChar);
        return(false);
      }
    }

    #endregion

    // ---------------------------------------------------------------------
    #region Tokenize wrapper methods
    // ---------------------------------------------------------------------

    /// <summary>
    /// Parse the rest of the stream and put all the tokens
    /// in the input ArrayList. This resets the line number to 1.
    /// </summary>
    /// <param name="tokens">The ArrayList to append to.</param>
    /// <returns>bool - true for success</returns>
    public bool Tokenize(ICollection<Token> tokens)
    {
      Token token;
      this.lineNumber = 1;

      while (NextToken(out token))
      {
        if (token == null) throw new NullReferenceException(
            "StreamTokenizer: Tokenize: Got a null token from NextToken.");
        tokens.Add(token);
      }

      // add the last token returned (EOF)
      tokens.Add(token);
      return(true);
    }

    /// <summary>
    /// Parse all tokens from the specified TextReader, put
    /// them into the input ArrayList.
    /// </summary>
    /// <param name="tr">The TextReader to read from.</param>
    /// <param name="tokens">The ArrayList to append to.</param>
    /// <returns>bool - true for success, false for failure.</returns>
    public bool TokenizeReader(TextReader tr, ICollection<Token> tokens)
    {
      textReader = tr;
      return(Tokenize(tokens));
    }

    /// <summary>
    /// Parse all tokens from the specified file, put
    /// them into the input ArrayList.
    /// </summary>
    /// <param name="fileName">The file to read.</param>
    /// <param name="tokens">The ArrayList to put tokens in.</param>
    /// <returns>bool - true for success, false for failure.</returns>
    public bool TokenizeFile(string fileName, ICollection<Token> tokens)
    {
      FileInfo fi = new FileInfo(fileName);
      FileStream fr = null;
      try 
      {
        fr = fi.Open(FileMode.Open, FileAccess.Read, FileShare.None);
        textReader = new StreamReader(fr);
      } 
      catch (DirectoryNotFoundException)
      {
      }
      try
      {
        if (!Tokenize(tokens))
        {
          log.Error("Unable to parse tokens from file {0}", fileName);
          textReader.Close();
          if (fr != null) fr.Close();
          return(false);
        }
      }
      catch(StreamTokenizerUntermException e)
      {
        textReader.Close();
        if (fr != null) fr.Close();
        throw e;
      }

      if (textReader != null) textReader.Close();
      if (fr != null) fr.Close();
      return(true);
    }

    /// <summary>
    /// Parse all tokens from the specified string, put
    /// them into the input ArrayList.
    /// </summary>
    /// <param name="str"></param>
    /// <param name="tokens">The ArrayList to put tokens in.</param>
    /// <returns>bool - true for success, false for failure.</returns>
    public bool TokenizeString(string str, ICollection<Token> tokens)
    {
      textReader = new StringReader(str);
      return(Tokenize(tokens));
    }

    /// <summary>
    /// Parse all tokens from the specified Stream, put
    /// them into the input ArrayList.
    /// </summary>
    /// <param name="s"></param>
    /// <param name="tokens">The ArrayList to put tokens in.</param>
    /// <returns>bool - true for success, false for failure.</returns>
    public bool TokenizeStream(Stream s, ICollection<Token> tokens)
    {
      textReader = new StreamReader(s);
      return(Tokenize(tokens));
    }

    /// <summary>
    /// Tokenize a file completely and return the tokens in a Token[].
    /// </summary>
    /// <param name="fileName">The file to tokenize.</param>
    /// <returns>A Token[] with all tokens.</returns>
    public Token[] TokenizeFile(string fileName)
    {
      var list = new List<Token>();
      if (!TokenizeFile(fileName, list))
      {
        return(null);
      }
      else
      {
        if (list.Count > 0)
        {
          return(list.ToArray());
        }
        else return(null);
      }
    }
    #endregion    
  }
}
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.