Parser.cs : » PDF » PDF-Clown » it » stefanochizzolini » clown » tokens » C# / CSharp Open Source

1.	2.6.4 mono .net core
2.	2.6.4 mono core
3.	Aspect Oriented Frameworks
4.	Bloggers
5.	Build Systems
6.	Business Application
7.	Charting Reporting Tools
8.	Chat Servers
9.	Code Coverage Tools
10.	Content Management Systems CMS
11.	CRM ERP
12.	Database
13.	Development
14.	Email
15.	Forum
16.	Game
17.	GIS
18.	GUI
19.	IDEs
20.	Installers Generators
21.	Inversion of Control Dependency Injection
22.	Issue Tracking
23.	Logging Tools
24.	Message
25.	Mobile
26.	Network Clients
27.	Network Servers
28.	Office
29.	PDF
30.	Persistence Frameworks
31.	Portals
32.	Profilers
33.	Project Management
34.	RSS RDF
35.	Rule Engines
36.	Script
37.	Search Engines
38.	Sound Audio
39.	Source Control
40.	SQL Clients
41.	Template Engines
42.	Testing
43.	UML
44.	Web Frameworks
45.	Web Service
46.	Web Testing
47.	Wiki Engines
48.	Windows Presentation Foundation
49.	Workflows
50.	XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » PDF » PDF Clown
PDF Clown » it » stefanochizzolini » clown » tokens » Parser.cs
/*
  Copyright 2006,2007,2008 Stefano Chizzolini. http://clown.stefanochizzolini.it

  Contributors:
    * Stefano Chizzolini (original code developer, http://www.stefanochizzolini.it)
    * Haakan Aakerberg (bugfix contributor):
      - [FIX:0.0.4:1]
      - [FIX:0.0.4:4]

  This file should be part of the source code distribution of "PDF Clown library"
  (the Program): see the accompanying README files for more info.

  This Program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later version.

  This Program is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY, either expressed or implied; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the License for more details.

  You should have received a copy of the GNU General Public License along with this
  Program (see README files); if not, go to the GNU website (http://www.gnu.org/).

  Redistribution and use, with or without modification, are permitted provided that such
  redistributions retain the above copyright notice, license and disclaimer, along with
  this list of conditions.
*/

using it.stefanochizzolini.clown.bytes;
using it.stefanochizzolini.clown.documents;
using it.stefanochizzolini.clown.files;
using it.stefanochizzolini.clown.objects;

using System;
using System.Globalization;
using System.IO;
using System.Text;

namespace it.stefanochizzolini.clown.tokens{
  /**
    <summary>Token parser.</summary>
  */
  public class Parser : IDisposable
  {
    #region types
    /**
      <summary>Indirect reference token: an object number paired with a generation number.</summary>
    */
    public struct Reference
    {
      #region fields
      /** <summary>Object number of the referenced indirect object.</summary> */
      public readonly int ObjectNumber;
      /** <summary>Generation number of the referenced indirect object.</summary> */
      public readonly int GenerationNumber;
      #endregion

      #region constructors
      /**
        <param name="objectNumber">Value stored in <c>ObjectNumber</c>.</param>
        <param name="generationNumber">Value stored in <c>GenerationNumber</c>.</param>
      */
      internal Reference(int objectNumber, int generationNumber)
      {
        ObjectNumber = objectNumber;
        GenerationNumber = generationNumber;
      }
      #endregion
    }
    #endregion

    #region static
    #region fields
    private static readonly string PdfHeader = "%PDF-";

    private static readonly Encoding ISO88591Encoding = Encoding.GetEncoding("iso-8859-1");

    private static readonly NumberFormatInfo StandardNumberFormatInfo = NumberFormatInfo.InvariantInfo;
    #endregion

    #region interface
    #region protected
    /**
      <summary>Convert the character code of a hexadecimal digit to its numeric value.</summary>
      <param name="c">Character code to convert.</param>
      <returns>A value between 0 and 15, or -1 if the character is not a hexadecimal digit
      (upper- and lower-case letters are both accepted).</returns>
    */
    protected static int GetHex(
      int c
      )
    {
      int value = -1;
      if('0' <= c && c <= '9')
      {value = c - '0';}
      else if('A' <= c && c <= 'F')
      {value = 10 + (c - 'A');}
      else if('a' <= c && c <= 'f')
      {value = 10 + (c - 'a');}

      return value;
    }

    /**
      <summary>Evaluate whether a character is a delimiter [PDF:1.6:3.1.1].</summary>
      <remarks>Recognized delimiters: round, angle and square brackets, slash and percent.
      NOTE(review): curly brackets are not treated as delimiters here -- confirm whether
      that is intentional.</remarks>
      <param name="c">Character code to test.</param>
      <returns>Whether the character is one of the recognized delimiters.</returns>
    */
    protected static bool IsDelimiter(
      int c
      )
    {
      switch(c)
      {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '/':
        case '%':
          return true;
        default:
          return false;
      }
    }

    /**
      <summary>Evaluate whether a character is an EOL marker [PDF:1.6:3.1.1].</summary>
      <remarks>The EOL characters are LINE FEED (decimal 10) and CARRIAGE RETURN (decimal 13).
      Beware that 12 and 15 are their OCTAL spellings (012, 015): used as decimal values they
      would match FORM FEED and SHIFT IN instead.</remarks>
      <param name="c">Character code to test.</param>
      <returns>Whether the character is a line feed or a carriage return.</returns>
    */
    protected static bool IsEOL(
      int c
      )
    {return (c == '\n' || c == '\r');}

    /**
      <summary>Evaluate whether a character is a white-space [PDF:1.6:3.1.1].</summary>
      <param name="c">Character code to test.</param>
      <returns>Whether the character is NUL (0), TAB (9), LF (10), FF (12), CR (13) or SPACE (32).</returns>
    */
    protected static bool IsWhitespace(
      int c
      )
    {
      switch(c)
      {
        case 0:
        case 9:
        case 10:
        case 12:
        case 13:
        case 32:
          return true;
        default:
          return false;
      }
    }
    #endregion
    #endregion
    #endregion

    #region dynamic
    #region fields
    private files.File file;
    private IInputStream stream;
    private object token;
    private TokenTypeEnum tokenType;

    private bool multipleTokenParsing;
    #endregion

    #region constructors
    /**
      <summary>Create a parser bound to an input stream and to its file.</summary>
      <param name="stream">Input stream the tokens are read from.</param>
      <param name="file">File associated with this parser.</param>
    */
    internal Parser(
      IInputStream stream,
      files.File file
      )
    {
      this.file = file;
      this.stream = stream;
    }
    #endregion

    #region interface
    #region public
    /**
      <summary>Gets the hash code of the underlying stream.</summary>
    */
    public override int GetHashCode(
      )
    {
      int streamHash = stream.GetHashCode();
      return streamHash;
    }

    /**
      <summary>Gets the length of the underlying stream.</summary>
    */
    public long Length
    {
      get
      {
        return stream.Length;
      }
    }

    /**
      <summary>Advance by the given number of tokens.</summary>
      <param name="offset">Number of tokens to be skipped before reaching the
      intended one (that is, how many times the parameterless overload is invoked).</param>
      <returns>False as soon as one of the moves finds no token; true otherwise
      (including when offset is zero or negative, in which case nothing is read).</returns>
    */
    public bool MoveNext(
      int offset
      )
    {
      int remaining = offset;
      while(remaining > 0)
      {
        if(!MoveNext())
          return false;

        remaining--;
      }

      return true;
    }

    /**
      <summary>Parse the next token [PDF:1.6:3.1].</summary>
      <remarks>
        <para>Contract:<ul>
          <li>Preconditions:
            <ol>
            <li>To properly parse the current token, the pointer MUST be just before its starting (leading whitespaces are ignored).</li>
            </ol>
          </li>
          <li>Postconditions:
            <ol>
            <li id="moveNext_contract_post[0]">When this method terminates, the pointer IS at the last byte of the current token.</li>
            </ol>
          </li>
          <li>Invariants:
            <ol>
            <li>The byte-level position of the pointer IS anytime (during token parsing) at the end of the current token (whereas the 'current token' represents the token-level position of the pointer).</li>
            </ol>
          </li>
          <li>Side-effects:
            <ol>
            <li>See <a href="#moveNext_contract_post[0]">Postconditions</a>.</li>
            </ol>
          </li>
        </ul></para>
        <para>The token value and type are stored in the <c>token</c> and <c>tokenType</c> fields.
        When no token is found (EOF reached while skipping white-space), the token value is null
        and the token type is left unchanged.</para>
        <para>A FileFormatException is thrown when EOF is met inside a name, number, hexadecimal
        string or literal string, or when a closing angle bracket is not doubled.</para>
      </remarks>
      <returns>Whether a new token was found.</returns>
    */
    public bool MoveNext(
      )
    {
      /*
        NOTE: It'd be interesting to evaluate an alternative regular-expression-based
        implementation...
      */
      StringBuilder buffer = null;
      token = null;
      int c = 0;

      // Skip white-space characters [PDF:1.6:3.1.1].
      do
      {
        c = stream.ReadByte();
        if(c == -1)
          return false;
      } while(IsWhitespace(c)); // Keep going as long as white-space characters are found...

      // Which character is it?
      switch(c)
      {
        case '/': // Name.
          tokenType = TokenTypeEnum.Name;

          buffer = new StringBuilder();
          while(true)
          {
            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed name object).",stream.Position);
            if(IsDelimiter(c) || IsWhitespace(c))
              break;

            buffer.Append((char)c);
          }
          stream.Skip(-1); // Recover the first byte after the current token.

          break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '.':
        case '-':
        case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
          switch(c)
          {
            case '.': // Decimal point.
              tokenType = TokenTypeEnum.Real;
              break;
            case '-':
            case '+': // Signum.
              tokenType = TokenTypeEnum.Integer; // By default (it may be real).
              break;
            default: // Digit.
              // Maybe an indirect reference (postfix notation [PDF:1.6:3.2.9])?
              if(!multipleTokenParsing)
                return MoveNextNumberOrReference();

              // Plain number (multiple token parsing -- see indirect reference search).
              tokenType = TokenTypeEnum.Integer; // By default (it may be real).
              break;
          }

          // Building the number...
          buffer = new StringBuilder();
          do
          {
            buffer.Append((char)c);
            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed number object).",stream.Position);
            if(c == '.')
              tokenType = TokenTypeEnum.Real;
            else if(c < '0' || c > '9')
              break;
          } while(true);

          stream.Skip(-1); // Recover the first byte after the current token.

          break;
        case '[': // Array (begin).
          tokenType = TokenTypeEnum.ArrayBegin;

          break;
        case ']': // Array (end).
          tokenType = TokenTypeEnum.ArrayEnd;

          break;
        case '<': // Dictionary (begin) | Hexadecimal string.
          c = stream.ReadByte();
          if(c == -1)
            throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).",stream.Position);
          // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
          if(c == '<')
          {
            tokenType = TokenTypeEnum.DictionaryBegin;
            break;
          }

          // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
          tokenType = TokenTypeEnum.Hex;

          // [FIX:0.0.4:4] It skipped after the first hexadecimal character, missing it.
          buffer = new StringBuilder();
          while(c != '>') // NOT string end.
          {
            // White-space inside hexadecimal strings is not significant [PDF:1.6:3.2.3].
            if(!IsWhitespace(c))
              buffer.Append((char)c);

            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed hex string).",stream.Position);
          }

          break;
        case '>': // Dictionary (end).
          c = stream.ReadByte();
          if(c != '>')
            throw new FileFormatException("Malformed dictionary.",stream.Position);

          tokenType = TokenTypeEnum.DictionaryEnd;

          break;
        case '%': // Comment.
          tokenType = TokenTypeEnum.Comment;

          // Collecting the comment content (up to, and consuming, the EOL marker or EOF)...
          buffer = new StringBuilder();
          while(true)
          {
            c = stream.ReadByte();
            if(c == -1 || IsEOL(c))
              break;

            buffer.Append((char)c);
          }

          break;
        case '(': // Literal string.
          tokenType = TokenTypeEnum.Literal;

          buffer = new StringBuilder();
          int level = 0; // Nesting depth of unescaped, balanced parentheses.
          while(true)
          {
            c = stream.ReadByte();
            if(c == -1)
              break;
            if(c == '(')
              level++;
            else if(c == ')')
              level--;
            else if(c == '\\')
            {
              bool lineBreak = false;
              c = stream.ReadByte();
              switch(c)
              {
                case 'n':
                  c = '\n';
                  break;
                case 'r':
                  c = '\r';
                  break;
                case 't':
                  c = '\t';
                  break;
                case 'b':
                  c = '\b';
                  break;
                case 'f':
                  c = '\f';
                  break;
                case '(':
                case ')':
                case '\\':
                  break;
                case '\r':
                  // Escaped line break (CR or CRLF): the string continues on the next line.
                  lineBreak = true;
                  c = stream.ReadByte();
                  if(c != '\n')
                    stream.Skip(-1);
                  break;
                case '\n':
                  // Escaped line break (LF): the string continues on the next line.
                  lineBreak = true;
                  break;
                default:
                {
                  // Is it outside the octal encoding?
                  if(c < '0' || c > '7') break;

                  // Octal [PDF:1.6:3.2.3] (one to three digits).
                  int octal = c - '0';
                  c = stream.ReadByte();
                  // Octal end?
                  if(c < '0' || c > '7')
                  {c = octal; stream.Skip(-1); break;}
                  octal = (octal << 3) + c - '0';
                  c = stream.ReadByte();
                  // Octal end?
                  if(c < '0' || c > '7')
                  {c = octal; stream.Skip(-1); break;}
                  octal = (octal << 3) + c - '0';
                  c = octal & 0xff; // High-order overflow is ignored.
                  break;
                }
              }
              if(lineBreak)
                continue;
              if(c == -1)
                break;
            }
            else if(c == '\r')
            {
              // Unescaped CR and CRLF are both normalized to a single LF.
              c = stream.ReadByte();
              if(c == -1)
                break;
              if(c != '\n')
              {c = '\n'; stream.Skip(-1);}
            }
            if(level == -1) // Closing parenthesis of the string itself.
              break;

            buffer.Append((char)c);
          }
          if(c == -1)
            throw new FileFormatException("Malformed literal string.",stream.Position);

          break;
        case 'R': // Indirect reference.
          tokenType = TokenTypeEnum.Reference;

          break;
        default: // Keyword object.
          tokenType = TokenTypeEnum.Keyword;

          buffer = new StringBuilder();
          do
          {
            buffer.Append((char)c);
            c = stream.ReadByte();
            if(c == -1)
              break;
          } while(!IsDelimiter(c) && !IsWhitespace(c));
          stream.Skip(-1); // Recover the first byte after the current token.

          break;
      }

      if(buffer != null)
      {
        /*
          Here we prepare the current token state.
        */
        // Which token type?
        switch(tokenType)
        {
          case TokenTypeEnum.Keyword:
            token = buffer.ToString();
            // Late recognition.
            switch((string)token)
            {
              case "false":
              case "true": // Boolean.
                tokenType = TokenTypeEnum.Boolean;
                token =  bool.Parse((string)token);
                break;
              case "null": // Null.
                tokenType = TokenTypeEnum.Null;
                token = null;
                break;
            }
            break;
          case TokenTypeEnum.Comment:
          case TokenTypeEnum.Hex:
          case TokenTypeEnum.Name:
            token = buffer.ToString();
            break;
          case TokenTypeEnum.Literal:
            token = buffer.ToString();
            // Late recognition.
            if(((string)token).StartsWith("D:",StringComparison.Ordinal)) // Date.
            {
              tokenType = TokenTypeEnum.Date;
              token = PdfDate.ToDateTime((string)token);
            }
            break;
          case TokenTypeEnum.Integer:
            token = Int32.Parse(
              buffer.ToString(),
              NumberStyles.Integer,
              StandardNumberFormatInfo
              );
            break;
          case TokenTypeEnum.Real:
            // [FIX:1668410] Parsing of float numbers was buggy (localized default number format).
            token = Single.Parse(
              buffer.ToString(),
              NumberStyles.Float,
              StandardNumberFormatInfo
              );
            break;
        }
      }

      return true;
    }

    /**
      <summary>Parse a token starting with a digit, recognizing the indirect reference pattern
      <c>ref := { int int 'R' }</c> [PDF:1.6:3.2.9].</summary>
      <remarks>
        <para>Require: the pointer MUST be just after the first digit of the number (it is
        moved one byte back before parsing).</para>
        <para>If the number is not followed by an integer and the reference keyword, the pointer
        is restored at the end of the number, which becomes the current (integer) token.</para>
        <para>Multiple token parsing is enabled for the duration of the look-ahead and is always
        disabled on exit, even when a nested parse throws.</para>
      </remarks>
      <returns>Always true (a number or a reference is found).</returns>
    */
    private bool MoveNextNumberOrReference(
      )
    {
      // Enable multiple token parsing (nested calls parse digits as plain numbers)!
      multipleTokenParsing = true;
      try
      {
        // 1. Object number.
        // Try the possible object number!
        stream.Skip(-1); MoveNext();
        // Isn't it a valid object number (i.e. it's a real number)?
        if(tokenType != TokenTypeEnum.Integer)
          return true;

        int objectNumber = (int)token;
        // Backup the recovery position!
        long oldOffset = stream.Position;

        // 2. Generation number.
        /*
          NOTE: The result of each look-ahead move MUST be checked, as at EOF the token type
          is stale (still the previous token's one) while the token value is null.
        */
        if(MoveNext()
          && tokenType == TokenTypeEnum.Integer)
        {
          int generationNumber = (int)token;

          // 3. Reference keyword.
          if(MoveNext()
            && tokenType == TokenTypeEnum.Reference)
          {
            token = new Reference(objectNumber,generationNumber);
            return true;
          }
        }

        // Rollback (plain integer)!
        stream.Seek(oldOffset);
        token = objectNumber; tokenType = TokenTypeEnum.Integer;
        return true;
      }
      finally
      {
        // Disable multiple token parsing!
        multipleTokenParsing = false;
      }
    }

    /**
      <summary>Build the PDF object corresponding to the current token, parsing further tokens
      when the object is a composite one (dictionary, stream, array).</summary>
      <remarks>
        <para>Require[0]: when this method is invoked, the pointer MUST be at (the end of) the first
        token of the object.</para>
        <para>Ensure[0]: when this method terminates, the pointer IS at (the end of) the last token of the object.</para>
        <para>Invariant[0]: stream data IS kept untouched.</para>
        <para>Side effect[0]: see Ensure[0].</para>
        <para>A FileFormatException is thrown if EOF is reached before a dictionary or an array
        is closed; an Exception is thrown if the current token type doesn't start an object.</para>
      </remarks>
      <returns>The parsed object (null in case of null token).</returns>
    */
    public PdfDataObject ParsePdfObject(
      )
    {
      /*
        NOTE: Object parsing is intrinsically a sequential operation tied to the stream pointer.
        Calls bound towards other classes are potentially disruptive for the predictability of
        the position of the stream pointer, so we are forced to carefully keep track of our
        current position in order to recover its proper state after any outbound call.
      */

      // Which token type?
      switch(tokenType)
      {
        case TokenTypeEnum.Integer:
          return new PdfInteger((int)token);
        case TokenTypeEnum.Name:
          return new PdfName((string)token,true);
        case TokenTypeEnum.Reference:
          /*
            NOTE: Curiously, PDF references are the only primitive objects that require
            a file reference. That's because they deal with indirect objects, which are strongly
            coupled with the current state of the file: so, PDF references are the fundamental
            bridge between the token layer and the file layer.
         */
          return new PdfReference(
            (Reference)token,
            file
            );
        case TokenTypeEnum.Literal:
          return new PdfTextString(
            ISO88591Encoding.GetBytes((string)token)
            );
        case TokenTypeEnum.DictionaryBegin:
          return ParseDictionaryOrStream();
        case TokenTypeEnum.ArrayBegin:
          return ParseArray();
        case TokenTypeEnum.Real:
          return new PdfReal((float)token);
        case TokenTypeEnum.Boolean:
          return new PdfBoolean((bool)token);
        case TokenTypeEnum.Date:
          return new PdfDate((DateTime)token);
        case TokenTypeEnum.Hex:
          return new PdfString(
            ISO88591Encoding.GetBytes((string)token),
            PdfString.SerializationModeEnum.Hex
            );
        case TokenTypeEnum.Null:
          return null;
        default:
          throw new Exception("Unknown type: " + tokenType);
      }
    }

    /**
      <summary>Parse a dictionary (the current token being its opening marker) and, in case it's
      followed by the 'stream' keyword, the stream object it introduces [PDF:1.6:3.2.7].</summary>
      <returns>Either the dictionary itself or the stream it's the header of.</returns>
    */
    private PdfDataObject ParseDictionaryOrStream(
      )
    {
      PdfDictionary dictionary = new PdfDictionary();
      // Populate the dictionary.
      while(true)
      {
        // Key.
        /*
          NOTE: At EOF the token type is stale, so a failed move MUST stop the parsing
          instead of being interpreted as another entry.
        */
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed dictionary object).",stream.Position);
        if(tokenType == TokenTypeEnum.DictionaryEnd)
          break;
        PdfName key = (PdfName)ParsePdfObject();

        // Value.
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed dictionary object).",stream.Position);
        PdfDirectObject value = (PdfDirectObject)ParsePdfObject();

        // Add the current entry to the dictionary!
        dictionary[key] = value;
      }

      // Keep the position as a long (no truncation on files larger than 2 GiB).
      long oldOffset = stream.Position;
      // Is this dictionary the header of a stream object [PDF:1.6:3.2.7]?
      if(!MoveNext()
        || tokenType != TokenTypeEnum.Keyword
        || !"stream".Equals(token))
      {
        stream.Seek(oldOffset); // Restore postcondition (last token should be the dictionary end).

        return dictionary;
      }

      // Keep track of current position!
      long position = stream.Position;

      // Get the stream length!
      /*
        NOTE: Indirect reference resolution is an outbound call (stream pointer hazard!),
        so we need to recover our current position after it returns.
      */
      int length = ((PdfInteger)files.File.Resolve(dictionary[PdfName.Length])).RawValue;

      // Come back to current position!
      stream.Seek(position);

      // Skip the EOL marker following the 'stream' keyword!
      /*
        NOTE: Only a single EOL marker (CRLF, LF or -- leniently -- lone CR) is consumed:
        any further white-space byte belongs to the stream data, whose extent is defined
        by the Length entry only.
      */
      int eol = stream.ReadByte();
      if(eol == '\r')
      {
        eol = stream.ReadByte();
        if(eol != '\n' && eol != -1)
        {stream.Skip(-1);} // Lone CR: give back the first data byte.
      }
      else if(eol != '\n' && eol != -1)
      {stream.Skip(-1);} // No EOL marker at all: give back the first data byte.

      // Copy the stream data to the instance!
      byte[] data = new byte[length];
      stream.Read(data);

      MoveNext(); // Postcondition (last token should be 'endstream' keyword).

      return new PdfStream(
        dictionary,
        new bytes.Buffer(data)
        );
    }

    /**
      <summary>Parse an array (the current token being its opening marker).</summary>
      <returns>The array populated with its items.</returns>
    */
    private PdfDataObject ParseArray(
      )
    {
      PdfArray array = new PdfArray();
      // Populate the array.
      while(true)
      {
        // Value.
        /*
          NOTE: At EOF the token type is stale, so a failed move MUST stop the parsing
          instead of adding the previous token over and over.
        */
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed array object).",stream.Position);
        if(tokenType == TokenTypeEnum.ArrayEnd)
          break;

        // Add the current item to the array!
        array.Add((PdfDirectObject)ParsePdfObject());
      }
      return array;
    }

    /**
      <summary>Retrieves the PDF version of the file [PDF:1.6:3.4.1].</summary>
      <remarks>The pointer is moved to the beginning of the stream to read the header.
      A FileFormatException is thrown if the header prefix is missing or the header is too
      short to carry the version.</remarks>
      <returns>The 3-character version string following the header prefix (for example "1.6").</returns>
    */
    public string RetrieveVersion(
      )
    {
      const int versionLength = 3; // Major digit, dot, minor digit.

      stream.Seek(0);
      string header = stream.ReadString(10);
      // Ordinal comparison: the header is raw file data, not linguistic text.
      if(header == null
        || !header.StartsWith(PdfHeader,StringComparison.Ordinal)
        || header.Length < PdfHeader.Length + versionLength) // Truncated header.
        throw new FileFormatException("PDF header not found.",stream.Position);

      return header.Substring(PdfHeader.Length,versionLength);
    }

    /**
      <summary>Retrieves the starting position of the last xref-table
      section.</summary>
      <remarks>The search starts from the end of the stream.</remarks>
    */
    public long RetrieveXRefOffset(
      )
    {
      long endOffset = stream.Length;
      return RetrieveXRefOffset(endOffset);
    }

    /**
      <summary>Retrieves the starting position of an xref-table section
      [PDF:1.6:3.4.4].</summary>
      <remarks>
        <para>Require: offset parameter MUST be the position of the EOF marker related to the intended section to be parsed.</para>
        <para>The 'startxref' keyword is searched within the (at most) 1024 bytes preceding the
        offset; bytes after the offset are never inspected, so that a later section can't be
        picked up by mistake.</para>
        <para>A FileFormatException is thrown if the keyword is not found or is not followed by
        an integer.</para>
      </remarks>
      <param name="offset">Position of the EOF marker closing the intended section.</param>
      <returns>The value following the 'startxref' keyword.</returns>
    */
    public long RetrieveXRefOffset(
      long offset
      )
    {
      const int chunkSize = 1024; // [PDF:1.6:H.3.18].

      // Move back before 'startxref' keyword!
      // [FIX:0.0.4:1] It failed to deal with less-than-1024-byte-long PDF files.
      long position = Math.Max(0,offset - chunkSize);
      stream.Seek(position);

      // Get 'startxref' keyword position!
      /*
        NOTE: The chunk is limited to the bytes preceding the offset (it's shorter than
        chunkSize when the offset is nearer than that to the beginning of the stream).
        Ordinal search: the chunk is raw file data, not linguistic text.
      */
      int index = stream.ReadString((int)(offset - position)).LastIndexOf("startxref",StringComparison.Ordinal);
      if(index < 0)
        throw new FileFormatException("PDF startxref not found.",stream.Position);
      // Go past the startxref keyword!
      stream.Seek(position + index); MoveNext();
      // Go to the xref offset!
      if(!MoveNext()
        || tokenType != TokenTypeEnum.Integer)
        throw new FileFormatException("PDF startxref malformed.",stream.Position);

      return (int)token;
    }

    /**
      <summary>Gets the current position of the underlying stream.</summary>
    */
    public long Position
    {
      get
      {
        return stream.Position;
      }
    }

    /**
      <summary>Forwards a seek request to the underlying stream.</summary>
      <param name="offset">Value passed to the stream's Seek method.</param>
    */
    public void Seek(
      long offset
      )
    {
      stream.Seek(offset);
    }

    /**
      <summary>Forwards a skip request to the underlying stream.</summary>
      <param name="offset">Value passed to the stream's Skip method.</param>
    */
    public void Skip(
      long offset
      )
    {
      stream.Skip(offset);
    }

    /**
      <summary>Moves past any white-space following the current position, so that the next
      byte read is the first non-white-space one.</summary>
      <returns>False if the end of the stream was reached before any non-white-space byte;
      true otherwise.</returns>
    */
    public bool SkipWhitespace(
      )
    {
      while(true)
      {
        int current = stream.ReadByte();
        if(current == -1)
          return false;
        if(!IsWhitespace(current))
          break;
      }
      // Step back one byte, as the first non-white-space byte has just been consumed.
      stream.Skip(-1);

      return true;
    }

    /**
      <summary>Gets the underlying input stream.</summary>
    */
    public IInputStream Stream
    {
      get
      {
        return stream;
      }
    }

    /**
      <summary>Gets the currently-parsed token.</summary>
      <returns>The value of the current token.</returns>
    */
    public object Token
    {
      get
      {
        return token;
      }
    }

    /**
      <summary>Gets the currently-parsed token type.</summary>
      <returns>The type of the current token.</returns>
    */
    public TokenTypeEnum TokenType
    {
      get
      {
        return tokenType;
      }
    }

    #region IDisposable
    /**
      <summary>Disposes the underlying stream (if still held) and releases the reference to it.</summary>
      <remarks>Calling this method again after the stream has been released does not touch
      the stream.</remarks>
    */
    public void Dispose(
      )
    {
      IInputStream current = stream;
      if(current != null)
      {
        current.Dispose();
        stream = null;
      }

      GC.SuppressFinalize(this);
    }
    #endregion
    #endregion
    #endregion
    #endregion
  }
}
www.java2v.com | Contact Us
All other trademarks are property of their respective owners.