/*
Copyright 2006,2007,2008 Stefano Chizzolini. http://clown.stefanochizzolini.it
Contributors:
* Stefano Chizzolini (original code developer, http://www.stefanochizzolini.it)
* Haakan Aakerberg (bugfix contributor):
- [FIX:0.0.4:1]
- [FIX:0.0.4:4]
This file should be part of the source code distribution of "PDF Clown library"
(the Program): see the accompanying README files for more info.
This Program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later version.
This Program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY, either expressed or implied; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the License for more details.
You should have received a copy of the GNU General Public License along with this
Program (see README files); if not, go to the GNU website (http://www.gnu.org/).
Redistribution and use, with or without modification, are permitted provided that such
redistributions retain the above copyright notice, license and disclaimer, along with
this list of conditions.
*/
using it.stefanochizzolini.clown.bytes;
using it.stefanochizzolini.clown.documents;
using it.stefanochizzolini.clown.files;
using it.stefanochizzolini.clown.objects;
using System;
using System.Globalization;
using System.IO;
using System.Text;
namespace it.stefanochizzolini.clown.tokens{
/**
<summary>Token parser.</summary>
*/
public class Parser : IDisposable
{
#region types
/**
  <summary>Raw indirect reference token, i.e. the (object number, generation number)
  pair collected by the parser when it meets the 'int int R' pattern.</summary>
*/
public struct Reference
{
  #region fields
  /** <summary>Object number (first integer of the pattern).</summary> */
  public readonly int ObjectNumber;
  /** <summary>Generation number (second integer of the pattern).</summary> */
  public readonly int GenerationNumber;
  #endregion

  #region constructors
  internal Reference(
    int objectNumber,
    int generationNumber
    )
  {
    ObjectNumber = objectNumber;
    GenerationNumber = generationNumber;
  }
  #endregion
}
#endregion
#region static
#region fields
// Leading marker of the file header line; the version digits follow it (see RetrieveVersion).
private static readonly string PdfHeader = "%PDF-";
// Encoding used to turn parsed string tokens back into bytes (see ParsePdfObject).
private static readonly Encoding ISO88591Encoding = Encoding.GetEncoding("iso-8859-1");
// Culture-invariant number format passed to Int32.Parse/Single.Parse when building numeric tokens.
private static readonly NumberFormatInfo StandardNumberFormatInfo = NumberFormatInfo.InvariantInfo;
#endregion
#region interface
#region protected
/**
  <summary>Converts a hexadecimal digit character into its numeric value.</summary>
  <param name="c">Character code to convert.</param>
  <returns>Value in the range 0..15, or -1 if the character is not a hexadecimal digit.</returns>
*/
protected static int GetHex(
  int c
  )
{
  int value = -1; // Not a hexadecimal digit (by default).
  if('0' <= c && c <= '9') // Decimal digit.
  {value = c - '0';}
  else if('A' <= c && c <= 'F') // Upper-case letter.
  {value = 10 + (c - 'A');}
  else if('a' <= c && c <= 'f') // Lower-case letter.
  {value = 10 + (c - 'a');}
  return value;
}
/**
  <summary>Evaluate whether a character is a delimiter [PDF:1.6:3.1.1].</summary>
  <remarks>The delimiter set is: ( ) &lt; &gt; [ ] { } / %</remarks>
  <param name="c">Character code to test.</param>
  <returns>Whether the character is a delimiter.</returns>
*/
protected static bool IsDelimiter(
  int c
  )
{
  switch(c)
  {
    case '(':
    case ')':
    case '<':
    case '>':
    case '[':
    case ']':
    case '{': // Curly brackets belong to the delimiter set too.
    case '}':
    case '/':
    case '%':
      return true;
    default:
      return false;
  }
}
/**
  <summary>Evaluate whether a character is an EOL marker [PDF:1.6:3.1.1].</summary>
  <remarks>EOL markers are the line feed (decimal 10) and the carriage return
  (decimal 13) characters.</remarks>
  <param name="c">Character code to test.</param>
  <returns>Whether the character is an EOL marker.</returns>
*/
protected static bool IsEOL(
  int c
  )
{
  // NOTE: Codes are decimal (LF = 10, CR = 13); 12 and 15 are their octal spellings.
  return (c == 10 || c == 13);
}
/**
  <summary>Evaluate whether a character is a white-space [PDF:1.6:3.1.1].</summary>
  <param name="c">Character code to test.</param>
  <returns>Whether the character is one of: null (0), tab (9), line feed (10),
  form feed (12), carriage return (13), space (32).</returns>
*/
protected static bool IsWhitespace(
  int c
  )
{
  switch(c)
  {
    case 0: // Null.
    case 9: // Horizontal tab.
    case 10: // Line feed.
    case 12: // Form feed.
    case 13: // Carriage return.
    case 32: // Space.
      return true;
    default:
      return false;
  }
}
#endregion
#endregion
#endregion
#region dynamic
#region fields
// File handed over to the PDF references built by ParsePdfObject.
private files.File file;
// Byte source the tokens are read from.
private IInputStream stream;
// Value of the current token (null when the token carries no value).
private object token;
// Type of the current token.
private TokenTypeEnum tokenType;
// Set while MoveNext() is looking ahead for the 'int int R' pattern, so that the nested
// MoveNext() calls parse digits as plain numbers instead of starting another lookahead.
private bool multipleTokenParsing;
#endregion
#region constructors
/**
  <summary>Creates a parser bound to the given stream.</summary>
  <param name="stream">Byte source to tokenize.</param>
  <param name="file">File associated to the parsed references.</param>
*/
internal Parser(
  IInputStream stream,
  files.File file
  )
{
  this.file = file;
  this.stream = stream;
}
#endregion
#region interface
#region public
/**
  <summary>Gets the hash code of the underlying stream.</summary>
*/
public override int GetHashCode(
  )
{
  int hashCode = stream.GetHashCode();
  return hashCode;
}
/**
  <summary>Gets the length of the underlying stream.</summary>
*/
public long Length
{
  get
  {
    return stream.Length;
  }
}
/**
  <summary>Moves forward by the given number of tokens.</summary>
  <param name="offset">Number of tokens to advance (values less than 1 leave the
  parser untouched).</param>
  <returns>Whether every requested token was found (false as soon as one of the
  single-token moves fails).</returns>
*/
public bool MoveNext(
  int offset
  )
{
  int remaining = offset;
  while(remaining > 0)
  {
    // No more tokens?
    if(!MoveNext())
      return false;

    remaining--;
  }
  return true;
}
/**
  <summary>Parse the next token [PDF:1.6:3.1].</summary>
  <remarks>
    <para>Contract:<ul>
      <li>Preconditions:
        <ol>
          <li>To properly parse the current token, the pointer MUST be just before its start (leading white-space characters are ignored).</li>
        </ol>
      </li>
      <li>Postconditions:
        <ol>
          <li id="moveNext_contract_post[0]">When this method terminates, the pointer IS at the last byte of the current token.</li>
          <li>When no token is found (end of stream met while skipping the leading white-space), the token value is null and the token type is left as it was.</li>
        </ol>
      </li>
      <li>Invariants:
        <ol>
          <li>The byte-level position of the pointer IS anytime (during token parsing) at the end of the current token (whereas the 'current token' represents the token-level position of the pointer).</li>
          <li>The indirect-reference lookahead state is always cleared before leaving this method, even if a nested parse throws.</li>
        </ol>
      </li>
      <li>Side-effects:
        <ol>
          <li>See <a href="#moveNext_contract_post[0]">Postconditions</a>.</li>
        </ol>
      </li>
    </ul></para>
    <para>Comment content is skipped rather than collected, so the token value of a comment is null.</para>
    <para>Numeric text is converted through Int32.Parse/Single.Parse: text they reject (such as a lone sign, or an integer out of the 32-bit range) surfaces as the exceptions thrown by those methods.</para>
  </remarks>
  <returns>Whether a new token was found.</returns>
  <exception cref="FileFormatException">The stream ended inside a name, number, hex string,
  literal string, or an opening angle bracket, or a closing angle bracket was not doubled.</exception>
*/
public bool MoveNext(
  )
{
  /*
    NOTE: It'd be interesting to evaluate an alternative regular-expression-based
    implementation...
  */
  StringBuilder buffer = null;
  token = null;
  int c = 0;

  // Skip white-space characters [PDF:1.6:3.1.1].
  do
  {
    c = stream.ReadByte();
    if(c == -1)
      return false;
  } while(IsWhitespace(c)); // Keep going as long as white-space characters are met...

  // Which character is it?
  switch(c)
  {
    case '/': // Name.
      tokenType = TokenTypeEnum.Name;

      buffer = new StringBuilder();
      while(true)
      {
        c = stream.ReadByte();
        if(c == -1)
          throw new FileFormatException("Unexpected EOF (malformed name object).",stream.Position);
        if(IsDelimiter(c) || IsWhitespace(c))
          break;

        buffer.Append((char)c);
      }
      stream.Skip(-1); // Recover the first byte after the current token.
      break;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
    case '.':
    case '-':
    case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
      switch(c)
      {
        case '.': // Decimal point.
          tokenType = TokenTypeEnum.Real;
          break;
        case '-':
        case '+': // Signum.
          tokenType = TokenTypeEnum.Integer; // By default (it may be real).
          break;
        default: // Digit.
          if(multipleTokenParsing) // Plain number (multiple token parsing -- see indirect reference search).
          {
            tokenType = TokenTypeEnum.Integer; // By default (it may be real).
          }
          else // Maybe an indirect reference (postfix notation [PDF:1.6:3.2.9]).
          {
            /*
              NOTE: We need to identify this pattern:
              ref := { int int 'R' }
            */
            // Enable multiple token parsing!
            multipleTokenParsing = true;
            try
            {
              // 1. Object number.
              // Try the possible object number (the digit just read is given back first)!
              stream.Skip(-1); MoveNext();
              // Isn't it a valid object number?
              if(tokenType != TokenTypeEnum.Integer)
                return true;

              // Assign object number!
              int objectNumber = (int)token;
              // Backup the recovery position!
              long oldOffset = stream.Position;

              // 2. Generation number.
              /*
                NOTE: A failed move (end of stream) leaves the token type stale and the token
                value null, so the result of each lookahead move MUST be checked before
                trusting the token state.
              */
              if(MoveNext()
                && tokenType == TokenTypeEnum.Integer)
              {
                // Assign generation number!
                int generationNumber = (int)token;

                // 3. Reference keyword.
                if(MoveNext()
                  && tokenType == TokenTypeEnum.Reference)
                {
                  token = new Reference(objectNumber,generationNumber);
                  return true;
                }
              }

              // Rollback (it was just a plain integer)!
              stream.Seek(oldOffset);
              token = objectNumber; tokenType = TokenTypeEnum.Integer;
              return true;
            }
            finally
            {
              // Disable multiple token parsing (whatever the outcome, exceptions included)!
              multipleTokenParsing = false;
            }
          }
          break;
      }

      // Building the number...
      buffer = new StringBuilder();
      do
      {
        buffer.Append((char)c);
        c = stream.ReadByte();
        if(c == -1)
          throw new FileFormatException("Unexpected EOF (malformed number object).",stream.Position);
        if(c == '.')
          tokenType = TokenTypeEnum.Real;
        else if(c < '0' || c > '9')
          break;
      } while(true);

      stream.Skip(-1); // Recover the first byte after the current token.
      break;
    case '[': // Array (begin).
      tokenType = TokenTypeEnum.ArrayBegin;
      break;
    case ']': // Array (end).
      tokenType = TokenTypeEnum.ArrayEnd;
      break;
    case '<': // Dictionary (begin) | Hexadecimal string.
      c = stream.ReadByte();
      if(c == -1)
        throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).",stream.Position);
      // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
      if(c == '<')
      {
        tokenType = TokenTypeEnum.DictionaryBegin;
        break;
      }

      // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
      tokenType = TokenTypeEnum.Hex;

      // [FIX:0.0.4:4] It skipped after the first hexadecimal character, missing it.
      buffer = new StringBuilder();
      while(c != '>') // NOT string end.
      {
        buffer.Append((char)c);
        c = stream.ReadByte();
        if(c == -1)
          throw new FileFormatException("Unexpected EOF (malformed hex string).",stream.Position);
      }
      break;
    case '>': // Dictionary (end).
      c = stream.ReadByte();
      if(c != '>')
        throw new FileFormatException("Malformed dictionary.",stream.Position);

      tokenType = TokenTypeEnum.DictionaryEnd;
      break;
    case '%': // Comment.
      tokenType = TokenTypeEnum.Comment;

      // Skipping comment content (no buffer is allocated, so the token value stays null)...
      do
      {
        c = stream.ReadByte();
        if(c == -1)
          break;
      } while(!IsEOL(c));
      break;
    case '(': // Literal string.
      tokenType = TokenTypeEnum.Literal;

      buffer = new StringBuilder();
      int level = 0; // Nesting depth of unescaped parentheses (-1 means the closing one was met).
      while(true)
      {
        c = stream.ReadByte();
        if(c == -1)
          break;
        if(c == '(')
          level++;
        else if(c == ')')
          level--;
        else if(c == '\\') // Escape sequence.
        {
          bool lineBreak = false;
          c = stream.ReadByte();
          switch(c)
          {
            case 'n':
              c = '\n';
              break;
            case 'r':
              c = '\r';
              break;
            case 't':
              c = '\t';
              break;
            case 'b':
              c = '\b';
              break;
            case 'f':
              c = '\f';
              break;
            case '(':
            case ')':
            case '\\':
              break;
            case '\r': // Escaped line break (CR or CR+LF): it's dropped.
              lineBreak = true;
              c = stream.ReadByte();
              if(c != '\n')
                stream.Skip(-1);
              break;
            case '\n': // Escaped line break (LF): it's dropped.
              lineBreak = true;
              break;
            default:
            {
              // Is it outside the octal encoding?
              if(c < '0' || c > '7') break;

              // Octal [PDF:1.6:3.2.3] (1 to 3 digits).
              int octal = c - '0';
              c = stream.ReadByte();
              // Octal end?
              if(c < '0' || c > '7')
              {c = octal; stream.Skip(-1); break;}
              octal = (octal << 3) + c - '0';
              c = stream.ReadByte();
              // Octal end?
              if(c < '0' || c > '7')
              {c = octal; stream.Skip(-1); break;}
              octal = (octal << 3) + c - '0';
              c = octal & 0xff;
              break;
            }
          }
          if(lineBreak)
            continue;
          if(c == -1)
            break;
        }
        else if(c == '\r') // Unescaped line break: it's normalized to LF.
        {
          c = stream.ReadByte();
          if(c == -1)
            break;
          if(c != '\n')
          {c = '\n'; stream.Skip(-1);}
        }
        if(level == -1)
          break;

        buffer.Append((char)c);
      }
      if(c == -1)
        throw new FileFormatException("Malformed literal string.",stream.Position);
      break;
    case 'R': // Indirect reference.
      tokenType = TokenTypeEnum.Reference;
      break;
    default: // Keyword object.
      tokenType = TokenTypeEnum.Keyword;

      buffer = new StringBuilder();
      do
      {
        buffer.Append((char)c);
        c = stream.ReadByte();
        if(c == -1)
          break;
      } while(!IsDelimiter(c) && !IsWhitespace(c));
      stream.Skip(-1); // Recover the first byte after the current token.
      break;
  }

  if(buffer != null)
  {
    /*
      Here we prepare the current token state.
    */
    // Which token type?
    switch(tokenType)
    {
      case TokenTypeEnum.Keyword:
        token = buffer.ToString();
        // Late recognition.
        switch((string)token)
        {
          case "false":
          case "true": // Boolean.
            tokenType = TokenTypeEnum.Boolean;
            token = bool.Parse((string)token);
            break;
          case "null": // Null.
            tokenType = TokenTypeEnum.Null;
            token = null;
            break;
        }
        break;
      case TokenTypeEnum.Comment:
      case TokenTypeEnum.Hex:
      case TokenTypeEnum.Name:
        token = buffer.ToString();
        break;
      case TokenTypeEnum.Literal:
        token = buffer.ToString();
        // Late recognition (ordinal match: the prefix is syntax, not linguistic text).
        if(((string)token).StartsWith("D:",StringComparison.Ordinal)) // Date.
        {
          tokenType = TokenTypeEnum.Date;
          token = PdfDate.ToDateTime((string)token);
        }
        break;
      case TokenTypeEnum.Integer:
        token = Int32.Parse(
          buffer.ToString(),
          NumberStyles.Integer,
          StandardNumberFormatInfo
          );
        break;
      case TokenTypeEnum.Real:
        // [FIX:1668410] Parsing of float numbers was buggy (localized default number format).
        token = Single.Parse(
          buffer.ToString(),
          NumberStyles.Float,
          StandardNumberFormatInfo
          );
        break;
    }
  }
  return true;
}
/**
  <summary>Builds the PDF object which begins at the current token.</summary>
  <remarks>
    <para>Require[0]: when this method is invoked, the pointer MUST be at (the end of) the first
    token of the object.</para>
    <para>Ensure[0]: when this method terminates, the pointer IS at (the end of) the last token of the object.</para>
    <para>Invariant[0]: stream data IS kept untouched.</para>
    <para>Side effect[0]: see Ensure[0].</para>
  </remarks>
  <returns>The parsed object (null for the null token).</returns>
  <exception cref="FileFormatException">The stream ended before the closing token of a
  dictionary or an array.</exception>
  <exception cref="Exception">The current token type cannot begin an object.</exception>
*/
public PdfDataObject ParsePdfObject(
  )
{
  /*
    NOTE: Object parsing is intrinsically a sequential operation tied to the stream pointer.
    Calls bound towards other classes are potentially disruptive for the predictability of
    the position of the stream pointer, so we are forced to carefully keep track of our
    current position in order to recover its proper state after any outbound call.
  */
  // Which token type?
  switch(tokenType)
  {
    case TokenTypeEnum.Integer:
      return new PdfInteger((int)token);
    case TokenTypeEnum.Name:
      return new PdfName((string)token,true);
    case TokenTypeEnum.Reference:
      /*
        NOTE: Curiously, PDF references are the only primitive objects that require
        a file reference. That's because they deal with indirect objects, which are strongly
        coupled with the current state of the file: so, PDF references are the fundamental
        bridge between the token layer and the file layer.
      */
      return new PdfReference(
        (Reference)token,
        file
        );
    case TokenTypeEnum.Literal:
      return new PdfTextString(
        ISO88591Encoding.GetBytes((string)token)
        );
    case TokenTypeEnum.DictionaryBegin:
      PdfDictionary dictionary = new PdfDictionary();
      // Populate the dictionary.
      while(true)
      {
        // Key.
        /*
          NOTE: A failed move leaves the previous token type in place, so it MUST be
          detected here instead of parsing a stale token.
        */
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed dictionary object).",stream.Position);
        if(tokenType == TokenTypeEnum.DictionaryEnd)
          break;
        PdfName key = (PdfName)ParsePdfObject();

        // Value.
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed dictionary object).",stream.Position);
        PdfDirectObject value = (PdfDirectObject)ParsePdfObject();

        // Add the current entry to the dictionary!
        dictionary[key] = value;
      }

      // Keep the position as a long (narrowing it would corrupt offsets beyond the int range).
      long oldOffset = stream.Position;
      // Is this dictionary the header of a stream object [PDF:1.6:3.2.7]?
      if(MoveNext()
        && (tokenType == TokenTypeEnum.Keyword)
        && token.Equals("stream"))
      {
        // Keep track of current position!
        long position = stream.Position;

        // Get the stream length!
        /*
          NOTE: Indirect reference resolution is an outbound call (stream pointer hazard!),
          so we need to recover our current position after it returns.
        */
        int length = ((PdfInteger)files.File.Resolve(dictionary[PdfName.Length])).RawValue;

        // Come back to current position!
        stream.Seek(position);

        // NOTE(review): this skips ANY white-space run after the keyword, so stream data
        // beginning with white-space bytes would be eaten too -- confirm against [PDF:1.6:3.2.7].
        SkipWhitespace();

        // Copy the stream data to the instance!
        byte[] data = new byte[length];
        stream.Read(data);

        MoveNext(); // Postcondition (last token should be 'endstream' keyword).

        return new PdfStream(
          dictionary,
          new bytes.Buffer(data)
          );
      }
      else
      {
        stream.Seek(oldOffset); // Restore postcondition (last token should be the dictionary end).

        return dictionary;
      }
    case TokenTypeEnum.ArrayBegin:
      PdfArray array = new PdfArray();
      // Populate the array.
      while(true)
      {
        // Value.
        // NOTE: Without this check an unterminated array may never leave the loop.
        if(!MoveNext())
          throw new FileFormatException("Unexpected EOF (malformed array object).",stream.Position);
        if(tokenType == TokenTypeEnum.ArrayEnd)
          break;

        // Add the current item to the array!
        array.Add((PdfDirectObject)ParsePdfObject());
      }
      return array;
    case TokenTypeEnum.Real:
      return new PdfReal((float)token);
    case TokenTypeEnum.Boolean:
      return new PdfBoolean((bool)token);
    case TokenTypeEnum.Date:
      return new PdfDate((DateTime)token);
    case TokenTypeEnum.Hex:
      return new PdfString(
        ISO88591Encoding.GetBytes((string)token),
        PdfString.SerializationModeEnum.Hex
        );
    case TokenTypeEnum.Null:
      return null;
    default: // Token types which cannot begin an object (comments and plain keywords included).
      throw new Exception("Unknown type: " + tokenType);
  }
}
/**
  <summary>Retrieves the PDF version of the file [PDF:1.6:3.4.1].</summary>
  <remarks>The pointer is moved to the beginning of the stream and left after the
  header chunk which has been read.</remarks>
  <returns>The 3-character version text following the header marker (for example, "1.6").</returns>
  <exception cref="FileFormatException">The stream doesn't begin with the header marker,
  or it ends before the version text is complete.</exception>
*/
public string RetrieveVersion(
  )
{
  const int versionLength = 3; // Major digit, dot, minor digit.

  stream.Seek(0);
  string header = stream.ReadString(10);
  // Ordinal match: the header marker is syntax, not linguistic text.
  if(!header.StartsWith(PdfHeader,StringComparison.Ordinal))
    throw new FileFormatException("PDF header not found.",stream.Position);
  // Guard against a truncated header (the extraction below would be out of range).
  if(header.Length < PdfHeader.Length + versionLength)
    throw new FileFormatException("PDF header truncated.",stream.Position);

  return header.Substring(PdfHeader.Length,versionLength);
}
/**
  <summary>Retrieves the starting position of the last xref-table
  section.</summary>
  <remarks>The search starts from the end of the stream.</remarks>
*/
public long RetrieveXRefOffset(
  )
{
  long streamLength = stream.Length;
  return RetrieveXRefOffset(streamLength);
}
/**
  <summary>Retrieves the starting position of an xref-table section
  [PDF:1.6:3.4.4].</summary>
  <remarks>
    <para>Require: offset parameter MUST be the position of the EOF marker related to the intended section to be parsed.</para>
    <para>The 'startxref' keyword is searched within the (at most) 1024 bytes which precede
    the given offset; bytes beyond the offset are never inspected.</para>
  </remarks>
  <param name="offset">Position where the backward search begins.</param>
  <returns>The integer following the 'startxref' keyword.</returns>
  <exception cref="FileFormatException">The 'startxref' keyword wasn't found, or it
  isn't followed by an integer.</exception>
*/
public long RetrieveXRefOffset(
  long offset
  )
{
  const int chunkSize = 1024; // [PDF:1.6:H.3.18].

  // Move back before 'startxref' keyword!
  long position = offset - chunkSize;
  if(position < 0)
  {position = 0;} // [FIX:0.0.4:1] It failed to deal with less-than-1024-byte-long PDF files.
  stream.Seek(position);

  // Get 'startxref' keyword position!
  /*
    NOTE: The chunk MUST end at the given offset: when the position has been clamped to the
    stream beginning, a full-sized chunk would spill over the offset and possibly catch the
    keyword of a later section.
  */
  int readLength = (int)(offset - position);
  int index = stream.ReadString(readLength).LastIndexOf("startxref",StringComparison.Ordinal);
  if(index < 0)
    throw new FileFormatException("PDF startxref not found.",stream.Position);

  // Go past the startxref keyword!
  stream.Seek(position + index); MoveNext();

  // Go to the xref offset!
  if(!MoveNext()
    || tokenType != TokenTypeEnum.Integer)
    throw new FileFormatException("PDF startxref malformed.",stream.Position);

  return (int)token;
}
/**
  <summary>Gets the current position of the underlying stream.</summary>
*/
public long Position
{
  get
  {
    return stream.Position;
  }
}
/**
  <summary>Forwards a seek request to the underlying stream.</summary>
  <param name="offset">Value handed to the stream's seek operation.</param>
*/
public void Seek(
  long offset
  )
{
  stream.Seek(offset);
}
/**
  <summary>Forwards a skip request to the underlying stream.</summary>
  <param name="offset">Value handed to the stream's skip operation.</param>
*/
public void Skip(
  long offset
  )
{
  stream.Skip(offset);
}
/**
  <summary>Consumes the white-space characters which follow the current position, then
  steps one byte back so that the first non-white-space byte is the next one to be read.</summary>
  <returns>Whether a non-white-space byte was met (false if the stream ended first).</returns>
*/
public bool SkipWhitespace(
  )
{
  while(true)
  {
    int current = stream.ReadByte();
    // End of stream?
    if(current == -1)
      return false;
    // First non-white-space byte?
    if(!IsWhitespace(current))
      break;
  }
  stream.Skip(-1); // Give the non-white-space byte back.
  return true;
}
/**
  <summary>Gets the underlying stream.</summary>
*/
public IInputStream Stream
{
  get
  {
    return stream;
  }
}
/**
  <summary>Gets the value of the current token.</summary>
  <returns>The current token value (null if the token carries no value).</returns>
*/
public object Token
{
  get
  {
    return token;
  }
}
/**
  <summary>Gets the type of the current token.</summary>
  <returns>The current token type.</returns>
*/
public TokenTypeEnum TokenType
{
  get
  {
    return tokenType;
  }
}
#region IDisposable
/**
  <summary>Disposes the underlying stream (if still held) and releases the reference to it.</summary>
*/
public void Dispose(
  )
{
  IInputStream current = stream;
  if(current != null)
  {
    current.Dispose();
    stream = null;
  }

  GC.SuppressFinalize(this);
}
#endregion
#endregion
#endregion
#endregion
}
}
|