using it.stefanochizzolini.clown.documents;
using it.stefanochizzolini.clown.documents.contents.objects;
using it.stefanochizzolini.clown.files;
using it.stefanochizzolini.clown.objects;
using System;
using System.Collections.Generic;
using System.Text;
namespace it.stefanochizzolini.clown.samples{
/**
<summary>This sample is a rough stub that demonstrates a basic way to extract text
from a document.</summary>
<remarks>
<para>This implementation is definitely simplistic: its purpose is NOT to provide
a real-life solution for PDF text mining; it lacks advanced features such as
character encoding management, glyph position detection, dehyphenation,
page-breaks handling and so on. Its purpose is to test the new content-stream parsing
functionality.</para>
<para>So, read my lips: this-is-just-a-toy (for now!).</para>
</remarks>
*/
public class TextExtractionSample
  : ISample
{
  /**
    <summary>Single-byte encoding used to turn the raw bytes of a PDF string into
    printable characters (no font-aware decoding is involved).</summary>
  */
  private static readonly Encoding ISO88591Encoding = Encoding.GetEncoding("iso-8859-1");

  /**
    <summary>Asks the user for a PDF file, then prints the raw text-showing strings
    of each page to the console, pausing after every page so that the user can either
    continue with the next page or quit.</summary>
    <param name="loader">Sample loader used to prompt the user for the PDF file path.</param>
  */
  public void Run(
    SampleLoader loader
    )
  {
    // (boilerplate user choice -- ignore it)
    string filePath = loader.GetPdfFileChoice("Please select a PDF file");

    // Open the PDF file!
    File file = new File(filePath);

    // Get the PDF document!
    Document document = file.Document;

    // Get the page collection!
    Pages pages = document.Pages;

    //TODO:IMPL see PDF:1.6:5.9
    foreach(Page page in pages)
    {
      Console.WriteLine("\nScanning page " + (page.Index+1) + "...\n");
      Extract(page.Contents);

      Console.WriteLine("\nENTER (Scan next page)");
      Console.WriteLine("[Q] (End scanning)");
      Console.Write("Please select: ");

      string choice;
      try
      {choice = Console.ReadLine();}
      catch(System.IO.IOException)
      {choice = null;} // Unreadable input is handled like end-of-input.

      /*
        NOTE: Console.ReadLine() returns null when the input stream is exhausted;
        in that case no quit request was made, so scanning simply goes on.
        The comparison is ordinal, so it doesn't depend on the current culture.
      */
      if(choice != null
        && choice.Equals("Q", StringComparison.OrdinalIgnoreCase)) // Quit.
        break;
    }
  }

  /**
    <summary>Walks the given content objects and writes to the console the raw strings
    passed to text-showing operations, descending recursively into container objects.
    A line break is emitted after each operation that printed at least one string.</summary>
    <param name="objects">Content objects to scan; a null list is silently ignored.</param>
  */
  private void Extract(
    IList<ContentObject> objects
    )
  {
    if(objects == null)
      return;

    foreach(ContentObject obj in objects)
    {
      if(obj is Text)
      {
        foreach(Operation operation in ((Text)obj).Objects)
        {
          if(!IsTextShowingOperator(operation.Operator))
            continue;

          /*
            NOTE: This algorithm doesn't deal with the character encoding
            used to represent the font glyphs, which may be arbitrarily defined;
            therefore, its text output has no direct relation with the actual
            Unicode text it stands for (to say: you may see weird sequences of
            symbols instead of the expected plain text).
            PDF Clown will implement font decoding support in the next releases
            through the documents.contents.fonts.Font.decode(byte[]) method.
          */
          bool hit = false;
          foreach(PdfDirectObject operand in operation.Operands)
          {
            if(operand is PdfString) // Plain string operand.
            {
              WriteRawString((PdfString)operand);
              hit = true;
            }
            else if(operand is PdfArray) // Array operand: only its string items are printed.
            {
              foreach(PdfDirectObject item in ((PdfArray)operand))
              {
                if(item is PdfString)
                {
                  WriteRawString((PdfString)item);
                  hit = true;
                }
              }
            }
          }
          if(hit)
          {Console.WriteLine();}
        }
      }
      else if(obj is ContainerObject)
      {Extract(((ContainerObject)obj).Objects);}
    }
  }

  /**
    <summary>Tells whether the given content stream operator is one of those
    whose operands are scanned for strings.</summary>
    <remarks>The text-showing operators are TJ, Tj, ' and " (see PDF:1.6:5.3.2).
    The two-apostrophe spelling that this sample originally tested for is still accepted
    for backward compatibility, in case the operator is reported that way.</remarks>
    <param name="operator_">Operator to test; null yields false.</param>
  */
  private static bool IsTextShowingOperator(
    string operator_
    )
  {
    return operator_ == "TJ"
      || operator_ == "Tj"
      || operator_ == "'"
      || operator_ == "\""
      || operator_ == "''";
  }

  /**
    <summary>Writes the raw bytes of the given string to the console, followed by
    a space.</summary>
    <remarks>This outputs the internal text representation, NOT the actual plain text
    (decoding to do yet!).</remarks>
    <param name="text">String whose raw value is to be written.</param>
  */
  private static void WriteRawString(
    PdfString text
    )
  {
    Console.Write(ISO88591Encoding.GetString(text.RawValue) + " ");
  }
}
}
|