Parser.cs : » PDF » PDF-Clown » it » stefanochizzolini » clown » documents » contents » tokens » C# / CSharp Open Source

1.	2.6.4 mono .net core
2.	2.6.4 mono core
3.	Aspect Oriented Frameworks
4.	Bloggers
5.	Build Systems
6.	Business Application
7.	Charting Reporting Tools
8.	Chat Servers
9.	Code Coverage Tools
10.	Content Management Systems CMS
11.	CRM ERP
12.	Database
13.	Development
14.	Email
15.	Forum
16.	Game
17.	GIS
18.	GUI
19.	IDEs
20.	Installers Generators
21.	Inversion of Control Dependency Injection
22.	Issue Tracking
23.	Logging Tools
24.	Message
25.	Mobile
26.	Network Clients
27.	Network Servers
28.	Office
29.	PDF
30.	Persistence Frameworks
31.	Portals
32.	Profilers
33.	Project Management
34.	RSS RDF
35.	Rule Engines
36.	Script
37.	Search Engines
38.	Sound Audio
39.	Source Control
40.	SQL Clients
41.	Template Engines
42.	Testing
43.	UML
44.	Web Frameworks
45.	Web Service
46.	Web Testing
47.	Wiki Engines
48.	Windows Presentation Foundation
49.	Workflows
50.	XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » PDF » PDF Clown
PDF Clown » it » stefanochizzolini » clown » documents » contents » tokens » Parser.cs
/*
  Copyright 2006,2007,2008 Stefano Chizzolini. http://clown.stefanochizzolini.it

  Contributors:
    * Stefano Chizzolini (original code developer, http://www.stefanochizzolini.it)
    * Haakan Aakerberg (bugfix contributor):
      - [FIX:0.0.4:4]

  This file should be part of the source code distribution of "PDF Clown library"
  (the Program): see the accompanying README files for more info.

  This Program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free Software
  Foundation; either version 2 of the License, or (at your option) any later version.

  This Program is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY, either expressed or implied; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the License for more details.

  You should have received a copy of the GNU General Public License along with this
  Program (see README files); if not, go to the GNU website (http://www.gnu.org/).

  Redistribution and use, with or without modification, are permitted provided that such
  redistributions retain the above copyright notice, license and disclaimer, along with
  this list of conditions.
*/

using bytes = it.stefanochizzolini.clown.bytes;
using it.stefanochizzolini.clown.documents.contents.objects;
using it.stefanochizzolini.clown.objects;
using it.stefanochizzolini.clown.tokens;

using System;
using System.Collections.Generic;
using System.Globalization;
using io = System.IO;
using System.Text;

namespace it.stefanochizzolini.clown.documents.contents.tokens{
  /**
    <summary>Content stream parser [PDF:1.6:3.7.1].</summary>
  */
  public class Parser
  {
/*
TODO:IMPL this parser evaluates a subset of the lexical domain of the token parser (clown.serialization.Parser): it would be better to derive both parsers from a common parsing engine in order to avoid unwieldy duplication.
*/
    #region static
    #region fields
    // Single-byte encoding used by ParsePdfObject to turn string tokens back into raw bytes.
    private static readonly Encoding ISO88591Encoding = Encoding.GetEncoding("iso-8859-1");

    // Culture-invariant number format used by MoveNext when converting integer and real tokens.
    private static readonly NumberFormatInfo StandardNumberFormatInfo = NumberFormatInfo.InvariantInfo;
    #endregion

    #region interface
    #region protected
    /**
      <summary>Converts a hexadecimal digit character into its numeric value.</summary>
      <param name="c">Character code to convert.</param>
      <returns>Value between 0 and 15, or -1 when the character is not a hexadecimal digit
      (upper- and lower-case letters are both accepted).</returns>
    */
    protected static int GetHex(
      int c
      )
    {
      int value;
      if('0' <= c && c <= '9') // Decimal digit.
      {value = c - '0';}
      else if('A' <= c && c <= 'F') // Upper-case letter digit.
      {value = 10 + (c - 'A');}
      else if('a' <= c && c <= 'f') // Lower-case letter digit.
      {value = 10 + (c - 'a');}
      else // Not a hexadecimal digit.
      {value = -1;}

      return value;
    }

    /**
      <summary>Evaluate whether a character is a delimiter [PDF:1.6:3.1.1].</summary>
      <remarks>The delimiter set is <c>( ) &lt; &gt; [ ] { } / %</c>: the curly brackets were
      formerly missing from the test.</remarks>
      <param name="c">Character code to evaluate.</param>
      <returns>Whether the character terminates the token which precedes it.</returns>
    */
    protected static bool IsDelimiter(
      int c
      )
    {
      switch(c)
      {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '{':
        case '}':
        case '/':
        case '%':
          return true;
        default:
          return false;
      }
    }

    /**
      <summary>Evaluate whether a character is an EOL marker [PDF:1.6:3.1.1].</summary>
      <remarks>EOL markers are the line feed (10) and the carriage return (13). The former test
      used 12 and 15, which are the form feed and shift-in control characters, so comments
      were never terminated by a line break.</remarks>
      <param name="c">Character code to evaluate.</param>
      <returns>Whether the character is a line feed or a carriage return.</returns>
    */
    protected static bool IsEOL(
      int c
      )
    {
      return (c == 10 || c == 13);
    }

    /**
      <summary>Evaluate whether a character is a white-space [PDF:1.6:3.1.1].</summary>
      <param name="c">Character code to evaluate.</param>
      <returns>Whether the character is one of: null (0), horizontal tab (9), line feed (10),
      form feed (12), carriage return (13), space (32).</returns>
    */
    protected static bool IsWhitespace(
      int c
      )
    {
      switch(c)
      {
        case 0: // Null.
        case 9: // Horizontal tab.
        case 10: // Line feed.
        case 12: // Form feed.
        case 13: // Carriage return.
        case 32: // Space.
          return true;
        default:
          return false;
      }
    }
    #endregion
    #endregion
    #endregion

    #region dynamic
    #region fields
    // Content stream being parsed: handled either as a PdfStream or as a PdfArray of stream references.
    private readonly PdfDataObject contentStream;

    // Offset of the current stream within the whole content (lengths of the streams already left behind).
    private long basePosition;
    // Stream currently being read; null when no stream is selected (before the first or past the last one).
    private bytes::IInputStream stream;
    // Index of the current stream; -1 until the first stream has been selected.
    private int streamIndex = -1;
    // Value of the last parsed token (its runtime type depends on tokenType).
    private object token;
    // Type of the last parsed token.
    private TokenTypeEnum tokenType;
    #endregion

    #region constructors
    /**
      <summary>Creates a parser over a content stream and selects its first underlying stream.</summary>
      <param name="contentStream">Content stream to parse (the parser handles it either as a
      PdfStream or as a PdfArray of stream references).</param>
    */
    internal Parser(
      PdfDataObject contentStream
      )
    {
      this.contentStream = contentStream;

      // Select the first underlying stream as the current one.
      MoveNextStream();
    }
    #endregion

    #region interface
    #region public
    /**
      <summary>Gets the content stream this parser works on.</summary>
      <remarks>The content stream is either one single stream or an array of streams.</remarks>
    */
    public PdfDataObject ContentStream
    {
      get
      {return contentStream;}
    }

    /**
      <summary>Gets the overall length of the content stream, that is the body length of the
      single stream or the sum of the body lengths of the streams referenced by the array.</summary>
    */
    public long Length
    {
      get
      {
        // Single stream.
        PdfStream singleStream = contentStream as PdfStream;
        if(singleStream != null)
          return singleStream.Body.Length;

        // Array of streams.
        long total = 0;
        foreach(PdfDirectObject item in (PdfArray)contentStream)
        {
          PdfStream itemStream = (PdfStream)((PdfReference)item).DataObject;
          total += itemStream.Body.Length;
        }
        return total;
      }
    }

    /**
      <summary>Moves forward by the given number of tokens.</summary>
      <param name="offset">Number of tokens to advance (values lower than 1 leave the parser
      untouched).</param>
      <returns>Whether every requested move succeeded (false as soon as no more tokens are
      available).</returns>
    */
    public bool MoveNext(
      int offset
      )
    {
      int remaining = offset;
      while(remaining > 0)
      {
        // No more tokens?
        if(!MoveNext())
        {return false;}

        remaining--;
      }

      return true;
    }

    /**
      <summary>Parse the next token [PDF:1.6:3.1].</summary>
      <remarks>
        <para>Contract:<ul>
          <li>Preconditions:
            <ol>
            <li>To properly parse the current token, the pointer MUST be just before its start (leading white-space characters are ignored).</li>
            </ol>
          </li>
          <li>Postconditions:
            <ol>
            <li id="moveNext_contract_post[0]">When this method terminates, the pointer IS at the last byte of the current token.</li>
            </ol>
          </li>
          <li>Side-effects:
            <ol>
            <li>See <a href="#moveNext_contract_post[0]">Postconditions</a>.</li>
            <li>When the current stream is exhausted while skipping white-space characters, the next stream becomes the current one.</li>
            </ol>
          </li>
        </ul></para>
        <para>Token values by type: name, hexadecimal string and keyword tokens are strings;
        integers are <c>int</c>; reals are <c>float</c>; literal strings are strings unless
        they start with "D:", in which case they are converted to dates; "true" and "false"
        keywords become booleans; the "null" keyword, comments and array/dictionary
        delimiters carry a null value.</para>
      </remarks>
      <returns>Whether a new token was found (false once no stream is left).</returns>
      <exception cref="FileFormatException">The data ends inside a name, a number, a hexadecimal
      string or a literal string; an angle bracket is not properly paired; a name contains
      an invalid hexadecimal code.</exception>
    */
    public bool MoveNext(
      )
    {
      if(stream == null)
        return false;

      /*
        NOTE: It'd be interesting to evaluate an alternative regular-expression-based
        implementation...
      */
      int c = 0;

      // Skip white-space characters [PDF:1.6:3.1.1].
      while(true)
      {
        c = stream.ReadByte();
        if(c == -1)
        {
          /* NOTE: Current stream has finished. */
          // Move to the next stream!
          MoveNextStream();
          // No more streams?
          if(stream == null)
            return false;
        }
        else if(!IsWhitespace(c)) // Keep going till a non-white-space character is found...
        {break;}
      }

      StringBuilder buffer = null;
      token = null;
      // Which character is it?
      switch(c)
      {
        case '/': // Name.
          tokenType = TokenTypeEnum.Name;

          buffer = new StringBuilder();
          while(true)
          {
            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed name object).",stream.Position);
            if(IsDelimiter(c) || IsWhitespace(c))
              break;
            // Is it an hexadecimal code [PDF:1.6:3.2.4]?
            if(c == '#')
            {
              int highNibble = GetHex(stream.ReadByte());
              int lowNibble = GetHex(stream.ReadByte());
              /*
                NOTE: GetHex() yields -1 both on EOF and on non-hexadecimal characters: combining
                such a value would silently inject a meaningless character into the name.
              */
              if(highNibble == -1 || lowNibble == -1)
                throw new FileFormatException("Malformed name object (invalid hexadecimal code).",stream.Position);

              c = (highNibble << 4) + lowNibble;
            }

            buffer.Append((char)c);
          }
          stream.Skip(-1); // Recover the first byte after the current token.

          break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '.':
        case '-':
        case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
          switch(c)
          {
            case '.': // Decimal point.
              tokenType = TokenTypeEnum.Real;
              break;
            default: // Digit or signum.
              tokenType = TokenTypeEnum.Integer; // By default (it may be real).
              break;
          }

          // Building the number...
          buffer = new StringBuilder();
          do
          {
            buffer.Append((char)c);
            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed number object).",stream.Position);
            if(c == '.')
              tokenType = TokenTypeEnum.Real;
            else if(c < '0' || c > '9')
              break;
          } while(true);

          stream.Skip(-1); // Recover the first byte after the current token.

          break;
        case '[': // Array (begin).
          tokenType = TokenTypeEnum.ArrayBegin;

          break;
        case ']': // Array (end).
          tokenType = TokenTypeEnum.ArrayEnd;

          break;
        case '<': // Dictionary (begin) | Hexadecimal string.
          c = stream.ReadByte();
          if(c == -1)
            throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).",stream.Position);
          // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
          if(c == '<')
          {
            tokenType = TokenTypeEnum.DictionaryBegin;
            break;
          }

          // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
          tokenType = TokenTypeEnum.Hex;

          // [FIX:0.0.4:4] It skipped after the first hexadecimal character, missing it.
          // NOTE: The characters are collected as they are (no hexadecimal decoding happens here).
          buffer = new StringBuilder();
          while(c != '>') // NOT string end.
          {
            buffer.Append((char)c);

            c = stream.ReadByte();
            if(c == -1)
              throw new FileFormatException("Unexpected EOF (malformed hex string).",stream.Position);
          }

          break;
        case '>': // Dictionary (end).
          c = stream.ReadByte();
          if(c != '>')
            throw new FileFormatException("Malformed dictionary.",stream.Position);

          tokenType = TokenTypeEnum.DictionaryEnd;

          break;
        case '%': // Comment.
          tokenType = TokenTypeEnum.Comment;
          // Skipping comment content (it is discarded: the token value stays null)...
          do
          {
            c = stream.ReadByte();
            if(c == -1)
              break;
          } while(!IsEOL(c));

          break;
        case '(': // Literal string.
          tokenType = TokenTypeEnum.Literal;

          buffer = new StringBuilder();
          int level = 0; // Nesting depth of the unescaped parentheses (-1 means string end).
          while(true)
          {
            c = stream.ReadByte();
            if(c == -1)
              break;
            if(c == '(')
              level++;
            else if(c == ')')
              level--;
            else if(c == '\\') // Escape sequence.
            {
              bool lineBreak = false;
              c = stream.ReadByte();
              switch(c)
              {
                case 'n':
                  c = '\n';
                  break;
                case 'r':
                  c = '\r';
                  break;
                case 't':
                  c = '\t';
                  break;
                case 'b':
                  c = '\b';
                  break;
                case 'f':
                  c = '\f';
                  break;
                case '(':
                case ')':
                case '\\':
                  break;
                case '\r': // Escaped line break (CR or CR+LF): the string goes on without it.
                  lineBreak = true;
                  c = stream.ReadByte();
                  if(c != '\n')
                    stream.Skip(-1);
                  break;
                case '\n': // Escaped line break (LF): the string goes on without it.
                  lineBreak = true;
                  break;
                default:
                {
                  // Is it outside the octal encoding?
                  if(c < '0' || c > '7') break;

                  // Octal (1 to 3 digits) [PDF:1.6:3.2.3].
                  int octal = c - '0';
                  c = stream.ReadByte();
                  // Octal end?
                  if(c < '0' || c > '7')
                  {c = octal; stream.Skip(-1); break;}
                  octal = (octal << 3) + c - '0';
                  c = stream.ReadByte();
                  // Octal end?
                  if(c < '0' || c > '7')
                  {c = octal; stream.Skip(-1); break;}
                  octal = (octal << 3) + c - '0';
                  c = octal & 0xff; // High-order overflow is ignored.
                  break;
                }
              }
              if(lineBreak)
                continue;
              if(c == -1)
                break;
            }
            else if(c == '\r') // Unescaped line break: normalized to a single line feed.
            {
              c = stream.ReadByte();
              if(c == -1)
                break;
              if(c != '\n')
              {c = '\n'; stream.Skip(-1);}
            }
            if(level == -1)
              break;

            buffer.Append((char)c);
          }
          if(c == -1)
            throw new FileFormatException("Malformed literal string.",stream.Position);

          break;
        default: // Keyword.
          tokenType = TokenTypeEnum.Keyword;

          buffer = new StringBuilder();
          do
          {
            buffer.Append((char)c);
            c = stream.ReadByte();
            if(c == -1)
              break;
          } while(!IsDelimiter(c) && !IsWhitespace(c));
          /*
            NOTE(review): the rewind below is applied on EOF too; whether that is harmless depends
            on the stream pointer being advanced by a failed read -- confirm against the
            IInputStream implementation.
          */
          stream.Skip(-1); // Recover the first byte after the current token.

          break;
      }

      if(buffer != null)
      {
        /*
          Here we prepare the current token state.
        */
        // Which token type?
        switch(tokenType)
        {
          case TokenTypeEnum.Keyword:
            token = buffer.ToString();
            // Late recognition.
            switch((string)token)
            {
              case "false":
              case "true": // Boolean.
                tokenType = TokenTypeEnum.Boolean;
                token =  bool.Parse((string)token);
                break;
              case "null": // Null.
                tokenType = TokenTypeEnum.Null;
                token = null;
                break;
            }
            break;
          case TokenTypeEnum.Comment:
          case TokenTypeEnum.Hex:
          case TokenTypeEnum.Name:
            token = buffer.ToString();
            break;
          case TokenTypeEnum.Literal:
            token = buffer.ToString();
            // Late recognition (ordinal comparison: the prefix is a syntactic marker, not linguistic text).
            if(((string)token).StartsWith("D:",StringComparison.Ordinal)) // Date.
            {
              tokenType = TokenTypeEnum.Date;
              token = PdfDate.ToDateTime((string)token);
            }
            break;
          case TokenTypeEnum.Integer:
            token = Int32.Parse(
              buffer.ToString(),
              NumberStyles.Integer,
              StandardNumberFormatInfo
              );
            break;
          case TokenTypeEnum.Real:
            // [FIX:1668410] Parsing of float numbers was buggy (localized default number format).
            token = Single.Parse(
              buffer.ToString(),
              NumberStyles.Float,
              StandardNumberFormatInfo
              );
            break;
        }
      }

      return true;
    }

    /**
      <summary>Parses the next content object [PDF:1.6:4.1]: the operation at the current
      position is parsed, then either wrapped into (or expanded to) the graphics object it
      introduces, or returned as it is.</summary>
      <returns>The graphics object introduced by the parsed operation, or the operation itself
      when it does not introduce any.</returns>
    */
    public ContentObject ParseContentObject(
      )
    {
      Operation operation = ParseOperation();

      // External object.
      if(operation is PaintXObject)
      {return new XObject(operation);}

      // Shading object.
      if(operation is PaintShadingObject)
      {return new ShadingObject(operation);}

      // Path.
      if(operation is BeginSubpath || operation is DrawRectangle)
      {return ParsePath(operation);}

      // Text.
      if(operation is BeginText)
      {
        List<Operation> textOperations = ParseContentObjects<Operation>();
        return new Text(textOperations);
      }

      // Local graphics state.
      if(operation is SaveGraphicsState)
      {
        List<ContentObject> stateObjects = ParseContentObjects<ContentObject>();
        return new LocalGraphicsState(stateObjects);
      }

      // Marked-content sequence.
      BeginMarkedContent beginMarkedContent = operation as BeginMarkedContent;
      if(beginMarkedContent != null)
      {
        List<ContentObject> markedObjects = ParseContentObjects<ContentObject>();
        return new MarkedContent(beginMarkedContent,markedObjects);
      }

      // Inline image.
      if(operation is BeginInlineImage)
      {return ParseInlineImage();}

      // Single operation.
      return operation;
    }

    /**
      <summary>Parses the following content objects until either an operation ending a
      multiple-operation graphics object is met or no more tokens are available.</summary>
      <returns>The parsed content objects (the ending operation is left out).</returns>
    */
    public List<T> ParseContentObjects<T>(
      ) where T : ContentObject
    {
      List<T> items = new List<T>();
      for(bool found = MoveNext(); found; found = MoveNext())
      {
        T item = (T)ParseContentObject();

        // Multiple-operation graphics object end?
        bool isEnd = item is EndText // Text.
          || item is RestoreGraphicsState // Local graphics state.
          || item is EndMarkedContent // Marked-content sequence.
          || item is EndInlineImage; // Inline image.
        if(isEnd)
          break;

        items.Add(item);
      }
      return items;
    }

    /**
      <summary>Parses the operation starting at the current token: the operands are collected
      until the operator keyword is reached.</summary>
      <remarks>
        <para>Require[0]: when this method is invoked, the pointer MUST be at (the end of) the
        first token of the operation.</para>
        <para>Ensure[0]: when this method terminates, the pointer IS at (the end of) the
        operator keyword.</para>
        <para>Comment tokens met among the operands are ignored.</para>
      </remarks>
      <returns>The operation built from the operator and its operands.</returns>
      <exception cref="FileFormatException">The content ends before any operator keyword is
      found (formerly this case looped forever, as the outcome of the token advance was
      ignored).</exception>
    */
    public Operation ParseOperation(
      )
    {
      List<PdfDirectObject> operands = new List<PdfDirectObject>();
      // Parsing the operation parts till the operator keyword is reached...
      while(tokenType != TokenTypeEnum.Keyword)
      {
        // Comments are neither operands nor operators: parsing them as objects would fail.
        if(tokenType != TokenTypeEnum.Comment)
        {operands.Add(ParsePdfObject());}

        // No more tokens?
        if(!MoveNext())
        {
          /* NOTE: No stream is current at this point, so the base position is reported. */
          throw new FileFormatException("Unexpected EOF (operation without operator).",basePosition);
        }
      }

      return Operation.Get((string)token,operands);
    }

    /**
      <summary>Builds the object corresponding to the current token; arrays and dictionaries
      are parsed recursively up to their closing delimiter.</summary>
      <remarks>
        <para>Require[0]: when this method is invoked, the pointer MUST be at (the end of) the first
        token of the object.</para>
        <para>Ensure[0]: when this method terminates, the pointer IS at (the end of) the last token of the object.</para>
        <para>Invariant[0]: stream data IS kept untouched.</para>
        <para>Side effect[0]: see Ensure[0].</para>
      </remarks>
      <returns>The parsed object (null for the null token).</returns>
      <exception cref="FileFormatException">The content ends inside an array or a dictionary
      (formerly this case looped forever, as the outcome of the token advance was ignored).</exception>
      <exception cref="Exception">The current token doesn't correspond to any object.</exception>
    */
    public PdfDirectObject ParsePdfObject(
      )
    {
      switch(tokenType)
      {
        case TokenTypeEnum.Integer:
          return new PdfInteger((int)token);
        case TokenTypeEnum.Name:
          return new PdfName((string)token,true);
        case TokenTypeEnum.Literal:
          return new PdfString(
            ISO88591Encoding.GetBytes((string)token),
            PdfString.SerializationModeEnum.Literal
            );
        case TokenTypeEnum.DictionaryBegin:
        {
          PdfDictionary dictionary = new PdfDictionary();
          // Populate the dictionary.
          while(true)
          {
            // Key.
            MoveNextContainerToken("dictionary");
            if(tokenType == TokenTypeEnum.DictionaryEnd)
              break;
            PdfName key = (PdfName)ParsePdfObject();

            // Value.
            MoveNextContainerToken("dictionary");
            PdfDirectObject value = ParsePdfObject();

            // Add the current entry to the dictionary!
            dictionary[key] = value;
          }
          return dictionary;
        }
        case TokenTypeEnum.ArrayBegin:
        {
          PdfArray array = new PdfArray();
          // Populate the array.
          while(true)
          {
            // Value.
            MoveNextContainerToken("array");
            if(tokenType == TokenTypeEnum.ArrayEnd)
              break;

            // Add the current item to the array!
            array.Add(ParsePdfObject());
          }
          return array;
        }
        case TokenTypeEnum.Real:
          return new PdfReal((float)token);
        case TokenTypeEnum.Boolean:
          return new PdfBoolean((bool)token);
        case TokenTypeEnum.Date:
          return new PdfDate((DateTime)token);
        case TokenTypeEnum.Hex:
          return new PdfString(
            ISO88591Encoding.GetBytes((string)token),
            PdfString.SerializationModeEnum.Hex
            );
        case TokenTypeEnum.Null:
          return null;
        default:
          throw new Exception("Unknown type: " + tokenType);
      }
    }

    /**
      <summary>Moves to the next token inside an array or a dictionary, ignoring comments.</summary>
      <param name="containerName">Name of the enclosing object, used for error reporting.</param>
      <exception cref="FileFormatException">No more tokens are available.</exception>
    */
    private void MoveNextContainerToken(
      string containerName
      )
    {
      do
      {
        if(!MoveNext())
        {
          /* NOTE: No stream is current at this point, so the base position is reported. */
          throw new FileFormatException("Unexpected EOF (malformed " + containerName + " object).",basePosition);
        }
      } while(tokenType == TokenTypeEnum.Comment);
    }

    /**
      <summary>Gets the pointer position within the whole content stream, i.e. the base
      position of the current stream plus the position inside it.</summary>
      <remarks>When no stream is current (for example once the last stream has been left
      behind), the base position alone is returned instead of failing on the missing stream.</remarks>
    */
    public long Position
    {
      get
      {
        if(stream == null)
          return basePosition;

        return basePosition + stream.Position;
      }
    }

    /**
      <summary>Moves the pointer to an absolute position within the whole content stream,
      switching the current stream when the position falls outside it.</summary>
      <param name="position">Target position, relative to the beginning of the first stream.</param>
      <exception cref="ArgumentException">The position lies before the first stream or after
      the last one.</exception>
    */
    public void Seek(
      long position
      )
    {
      for(;;)
      {
        // Before current stream?
        if(position < basePosition)
        {
          if(MovePreviousStream())
            continue;

          throw new ArgumentException("Lower than acceptable.","position");
        }

        // After current stream?
        if(position > basePosition + stream.Length)
        {
          if(MoveNextStream())
            continue;

          throw new ArgumentException("Higher than acceptable.","position");
        }

        // At current stream.
        stream.Seek(position - basePosition);
        return;
      }
    }

    /**
      <summary>Moves the pointer by a relative offset, switching the current stream when the
      target falls outside it.</summary>
      <param name="offset">Number of bytes to move by (negative values move backwards).</param>
      <exception cref="ArgumentException">The target lies before the first stream or after the
      last one.</exception>
    */
    public void Skip(
      long offset
      )
    {
      while(true)
      {
        // Target position within the current stream.
        long position = stream.Position + offset;
        if(position < 0) // Before current stream.
        {
          // Consume the distance to the beginning of the current stream.
          offset += stream.Position;
          if(!MovePreviousStream())
            throw new ArgumentException("Lower than acceptable.","offset");

          // The residual (negative) offset is relative to the end of the previous stream.
          stream.Position = stream.Length;
        }
        else if(position > stream.Length) // After current stream.
        {
          // Consume the distance to the end of the current stream.
          offset -= (stream.Length - stream.Position);
          if(!MoveNextStream())
            throw new ArgumentException("Higher than acceptable.","offset");

          // The residual offset is relative to the beginning of the next stream.
          stream.Position = 0;
        }
        else // At current stream.
        {
          /*
            NOTE: 'position' is an absolute position within the current stream, so it must be
            sought: passing it to the relative stream.Skip() moved the pointer too far.
          */
          stream.Seek(position);
          break;
        }
      }
    }

    /**
      <summary>Skips the white-space characters following the current position, leaving the
      pointer just before the first non-white-space character of the current stream.</summary>
      <returns>Whether a non-white-space character was found before the end of the current
      stream.</returns>
    */
    public bool SkipWhitespace(
      )
    {
      while(true)
      {
        int current = stream.ReadByte();
        // Current stream finished?
        if(current == -1)
          return false;
        // First non-white-space character reached?
        if(!IsWhitespace(current))
          break;
      }
      // Step back, so that the non-white-space character is the next one to be read.
      stream.Skip(-1);

      return true;
    }

    /**
      <summary>Gets the stream currently being read (null when no stream is selected).</summary>
    */
    public bytes::IInputStream Stream
    {
      get
      {return stream;}
    }

    /**
      <summary>Gets the index of the stream currently being read (-1 until the first stream
      has been selected).</summary>
    */
    public int StreamIndex
    {
      get
      {return streamIndex;}
    }

    /**
      <summary>Gets the value of the last parsed token.</summary>
      <returns>The current token value (its runtime type depends on the token type).</returns>
    */
    public object Token
    {
      get
      {return token;}
    }

    /**
      <summary>Gets the type of the last parsed token.</summary>
      <returns>The current token type.</returns>
    */
    public TokenTypeEnum TokenType
    {
      get
      {return tokenType;}
    }
    #endregion

    #region private
    /**
      <summary>Selects the stream following the current one, updating the stream index and the
      base position accordingly.</summary>
      <remarks>
        <para>Once the last stream has been left behind, no stream is current and the base
        position corresponds to the end of the content.</para>
        <para>The newly-selected stream is rewound, as every caller expects reading to start at
        its beginning.</para>
      </remarks>
      <returns>Whether a stream was selected (false when no further stream exists).</returns>
    */
    private bool MoveNextStream(
      )
    {
      /* NOTE: A content stream may be made up of multiple streams [PDF:1.6:3.6.2]. */
      // Is the content stream just a single stream?
      if(contentStream is PdfStream) // Single stream.
      {
        // Leaving the (only) stream?
        if(streamIndex == 0)
        {
          streamIndex++;
          if(stream != null)
          {basePosition += stream.Length;}
          stream = null;
        }
        // Past the end?
        if(streamIndex == 1)
          return false;

        streamIndex++;
        basePosition = 0;
        stream = ((PdfStream)contentStream).Body;
      }
      else // Array of streams.
      {
        PdfArray streams = (PdfArray)contentStream;
        // Leaving the last stream?
        if(streamIndex == (streams.Count -1))
        {
          streamIndex++;
          /*
            NOTE: With an empty array this branch is taken on the very first call, when no
            stream has been selected yet: dereferencing it failed.
          */
          if(stream != null)
          {basePosition += stream.Length;}
          stream = null;
        }
        // Past the end?
        if(streamIndex == streams.Count)
          return false;

        streamIndex++;
        if(streamIndex == 0)
        {basePosition = 0;}
        else
        {basePosition += stream.Length;} // Length of the stream being left.
        stream = ((PdfStream)((PdfReference)streams[streamIndex]).DataObject).Body;
      }
      // Reading starts at the beginning of the stream, whatever its pointer was left at.
      stream.Seek(0);

      return true;
    }

    /**
      <summary>Selects the stream preceding the current one, updating the stream index and the
      base position accordingly.</summary>
      <remarks>Stepping back from the first stream leaves no stream selected.</remarks>
      <returns>Whether a stream was selected (false when no previous stream exists).</returns>
    */
    private bool MovePreviousStream(
      )
    {
      // Leaving the first stream?
      if(streamIndex == 0)
      {
        stream = null;
        streamIndex = -1;
      }
      // Before the beginning?
      if(streamIndex == -1)
      {return false;}

      streamIndex--;
      /* NOTE: A content stream may be made up of multiple streams [PDF:1.6:3.6.2]. */
      PdfStream singleStream = contentStream as PdfStream;
      if(singleStream != null) // Single stream.
      {
        stream = singleStream.Body;
        basePosition = 0;
      }
      else // Array of streams.
      {
        PdfArray streams = (PdfArray)contentStream;
        PdfReference streamReference = (PdfReference)streams[streamIndex];

        stream = ((PdfStream)streamReference.DataObject).Body;
        // The base position moves back by the length of the newly-selected stream.
        basePosition -= stream.Length;
      }

      return true;
    }

    /**
      <summary>Parses an inline image: its header entries first, then its raw data up to the
      end marker.</summary>
      <remarks>
        <para>Require[0]: when this method is invoked, the current token MUST be the operator
        keyword which began the inline image.</para>
        <para>Ensure[0]: when this method terminates, the pointer IS just after the 'EI' end
        marker (or no stream is current, in case the marker was missing).</para>
        <para>The end marker is detected as the first 'E' byte immediately followed by an 'I'
        byte, wherever it occurs: image data which happen to contain such a sequence are cut
        short.</para>
      </remarks>
      <returns>The inline image made up of the parsed header and body.</returns>
    */
    private InlineImage ParseInlineImage(
      )
    {
      /*
        NOTE: Inline images use a peculiar syntax that's an exception to the usual rule
        that the data in a content stream is interpreted according to the standard PDF syntax
        for objects.
      */
      InlineImageHeader header;
      {
        List<PdfDirectObject> operands = new List<PdfDirectObject>();
        // Parsing the image entries...
        /*
          NOTE: The current token is the keyword which began the image, so the pointer MUST be
          advanced before looking for the keyword which begins the image data (otherwise no
          entry is ever collected).
        */
        while(MoveNext()
          && tokenType != TokenTypeEnum.Keyword) // Not keyword (i.e. end at image data beginning (ID operator)).
        {operands.Add(ParsePdfObject());}
        header = new InlineImageHeader(operands);
      }

      InlineImageBody body;
      {
        bytes::Buffer data = new bytes::Buffer();
        /*
          NOTE: Image data are raw bytes, so they MUST NOT be tokenized: just the single
          white-space character expected after the data-beginning keyword is skipped.
        */
        if(stream != null)
        {stream.ReadByte();}

        int previous = -1; // Last byte read, not stored yet as it may begin the end marker (-1: none).
        bool endFound = false;
        while(stream != null)
        {
          int current = stream.ReadByte();
          if(current == -1)
          {
            /* NOTE: Current stream has finished. */
            // Move to the next stream!
            MoveNextStream();
            continue;
          }

          // End marker reached?
          if(previous == 'E' && current == 'I')
          {
            endFound = true;
            break;
          }

          if(previous != -1)
          {data.Append((byte)previous);}
          previous = current;
        }
        // Data finished without end marker: the held-back byte belongs to the image.
        if(!endFound && previous != -1)
        {data.Append((byte)previous);}

        body = new InlineImageBody(data);
      }

      return new InlineImage(
        header,
        body
        );
    }

    /**
      <summary>Collects the operations which make up a path, starting from its first one.</summary>
      <remarks>Paths do not have an explicit end operation, so their end is inferred: the path
      is over at the first non-painting operation which follows a painting one. Such an
      operation is not part of the path, so the pointer is rolled back just before it.</remarks>
      <param name="beginOperation">Operation which began the path.</param>
      <returns>The path made up of the collected operations.</returns>
    */
    private Path ParsePath(
      Operation beginOperation
      )
    {
      List<Operation> pathOperations = new List<Operation>();
      pathOperations.Add(beginOperation);

      bool painted = false; // Whether a painting operation has already been met.
      long rollbackPosition = Position; // End of the last operation belonging to the path.
      for(;;)
      {
        // No more tokens?
        if(!MoveNext())
          break;

        Operation current = ParseOperation();
        if(current is PaintPath) // Painting operation.
        {painted = true;}
        else if(painted) // Past end (first non-painting operation).
        {
          // Roll back to the end of the last path-related operation.
          Seek(rollbackPosition);
          break;
        }

        pathOperations.Add(current);
        rollbackPosition = Position;
      }

      return new Path(pathOperations);
    }
    #endregion
    #endregion
    #endregion
  }
}
www.java2v.com | Contact Us
All other trademarks are property of their respective owners.